// CuNNy 3x12 - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.


//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-03x12
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"

// Source image of the effect; pass 1 samples its RGB channels.
//!TEXTURE
Texture2D INPUT;

// Result texture, declared at twice the width and twice the height of INPUT.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; this is the sampler used by the O() macro.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, x, y): sample texture `t` with the point sampler SP at mip level 0,
// at `pos` displaced by (x, y) scaled by `pt`. The macro captures `pos` and
// `pt` from the expansion site, so both must be in scope wherever it is used.
#define O(t, x, y) t.SampleLevel(SP, pos + float2(x, y) * pt, 0)
// Short aliases for MF4 / MF4x4, which are not defined in this file.
// NOTE(review): presumably 4-component vector and 4x4 matrix types whose
// precision depends on FP16 support — confirm against the included stubs.
#define V4 MF4
#define M4 MF4x4

// Intermediate feature textures. All six are declared at INPUT resolution with
// the R8G8B8A8_UNORM format. T0..T2 are written by pass 1 and pass 3 and read
// by pass 2; T3..T5 are written by pass 2 and read by pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T3;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T4;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T5;

//!PASS 1
//!DESC in (3x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT T0, T1, T2

// L0(x, y): RGB of INPUT at texel offset (x, y) from the current pixel,
// converted to V3. Expands through O(), so it needs `pos` and `pt` in scope.
// NOTE(review): L0 is defined again before each later pass; presumably every
// pass is compiled as its own unit so the definitions never collide — confirm.
#define L0(x, y) V3(O(INPUT, x, y).rgb)
// Aliases for MF3 / MF3x4, which are not defined in this file.
#define V3 MF3
#define M3x4 MF3x4

// Pass 1 ("in (3x12)"): for one pixel, reads the 3x3 neighbourhood of INPUT's
// RGB channels, accumulates it through nine groups of weight matrices into
// three 4-component vectors, clamps negatives to zero and stores the results
// in T0, T1 and T2.
//
// blockStart: added to the remapped thread coordinate to get the pixel handled
//             by this invocation.
// tid:        thread id; only tid.x is used, as the argument to Rmp8x8.
void Pass1(uint2 blockStart, uint3 tid) {
	// presumably the size of one input texel in normalized coordinates —
	// GetInputPt is defined outside this file; TODO confirm
	float2 pt = float2(GetInputPt());
	// presumably Rmp8x8 maps the linear thread index onto an 8x8 tile — confirm
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads whose pixel lies outside the input do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// The +0.5 addresses the middle of pixel gxy; `pos` and `pt` are what the
	// O()/L0() macros read.
	float2 pos = (gxy + 0.5) * pt;
	V3 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// The accumulators start from constant vectors (presumably the layer's
	// bias terms); the zero initializers above are overwritten immediately.
	r0 = V4(-3.838e-04, -1.473e-02, 2.105e-04, -1.662e-02);
	r1 = V4(-5.161e-04, -7.702e-03, 5.444e-04, 5.153e-01);
	r2 = V4(-1.625e-04, -4.639e-04, -6.901e-05, 1.916e-01);
	// Fetch the 3x3 neighbourhood: s0_<row>_<col>, row/col 0..2 <-> offset -1..+1.
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	// One MulAdd per (tap, output vector). MulAdd is defined outside this file;
	// presumably it returns the sample times the matrix plus the accumulator.
	r0 = MulAdd(s0_0_0, M3x4(-8.004e-02, 3.306e-02, 2.809e-03, 1.799e-02, 4.088e-02, -1.101e-01, 1.779e-01, -6.916e-03, 6.212e-02, -1.051e-01, 8.316e-03, -1.999e-02), r0);
	r1 = MulAdd(s0_0_0, M3x4(-1.018e-03, 5.281e-02, -3.671e-03, -9.022e-04, 8.121e-03, 3.896e-01, 6.807e-03, -2.565e-03, -2.056e-03, 2.885e-02, -5.021e-03, 6.286e-03), r1);
	r2 = MulAdd(s0_0_0, M3x4(-6.619e-03, -6.941e-03, -9.951e-04, 2.560e-03, 1.368e-02, -8.019e-03, 8.860e-03, 3.502e-02, -1.219e-02, -1.648e-02, -1.284e-04, 1.618e-02), r2);
	r0 = MulAdd(s0_0_1, M3x4(-5.145e-02, -8.557e-02, 7.632e-02, 1.667e-02, 2.628e-02, 4.722e-01, 1.992e-01, -1.234e-01, 1.607e-02, 1.615e-01, -6.932e-03, -1.517e+00), r0);
	r1 = MulAdd(s0_0_1, M3x4(9.342e-03, 1.045e-01, 1.247e-02, 2.767e-02, -2.459e-02, 9.835e-02, -2.164e-02, 6.366e-02, 1.557e-03, 7.984e-02, 1.361e-02, 1.242e-02), r1);
	r2 = MulAdd(s0_0_1, M3x4(1.802e-01, 2.350e-02, -2.171e-03, 3.287e-02, 7.780e-01, 1.447e-02, -1.550e-03, 1.412e-01, 6.470e-02, 5.975e-03, -4.353e-03, 4.922e-03), r2);
	r0 = MulAdd(s0_0_2, M3x4(3.057e-01, 1.163e-01, -5.224e-02, -4.806e-02, -1.667e-01, 5.029e-02, 1.030e-01, 2.855e-02, -1.322e-01, -5.030e-02, 3.477e-02, 4.541e-01), r0);
	r1 = MulAdd(s0_0_2, M3x4(4.095e-03, -1.366e-01, 1.371e-02, 7.380e-03, 7.256e-03, -3.971e-01, -7.612e-03, -1.147e-02, -6.970e-03, -1.003e-01, -6.160e-03, -3.729e-03), r1);
	r2 = MulAdd(s0_0_2, M3x4(-1.672e-02, -5.558e-04, 1.079e-03, -1.867e-03, 2.246e-02, 3.502e-02, 4.412e-03, 1.433e-02, -1.110e-02, -5.331e-03, -2.294e-03, -2.783e-03), r2);
	r0 = MulAdd(s0_1_0, M3x4(-5.050e-01, -1.129e-01, 1.362e-01, -1.381e-01, 2.457e-01, 4.098e-01, 3.087e-01, -1.106e-01, 2.201e-01, 2.176e-02, 6.282e-02, 2.000e-01), r0);
	r1 = MulAdd(s0_1_0, M3x4(1.641e-02, -2.346e-02, 2.563e-02, -4.359e-04, -2.779e-02, 6.961e-02, -2.510e-02, 2.655e-02, 1.316e-02, 8.984e-03, -6.255e-04, 2.326e-03), r1);
	r2 = MulAdd(s0_1_0, M3x4(1.201e-02, -2.015e-02, -1.423e-02, -3.623e-02, -1.895e-02, 3.178e-02, 2.327e-02, 6.212e-02, 1.200e-02, 4.885e-03, 3.593e-03, 8.842e-03), r2);
	r0 = MulAdd(s0_1_1, M3x4(-8.829e-01, 1.794e-01, -3.974e-01, 6.902e-02, 4.236e-01, -2.352e+00, -8.976e-01, 3.539e-01, 4.229e-01, 1.710e-01, -8.174e-02, 1.806e-01), r0);
	r1 = MulAdd(s0_1_1, M3x4(-2.061e-01, -1.003e-01, -2.644e-01, -1.278e-01, -7.559e-01, -9.074e-01, -8.613e-01, -4.990e-01, -7.363e-02, -1.090e-01, -7.582e-02, -2.663e-02), r1);
	r2 = MulAdd(s0_1_1, M3x4(-1.855e-01, -3.328e-01, 2.014e-02, -1.061e-01, -7.715e-01, -1.080e+00, -3.037e-02, -6.953e-01, -6.446e-02, -8.815e-02, 1.773e-03, -3.069e-02), r2);
	r0 = MulAdd(s0_1_2, M3x4(4.815e-01, -9.921e-02, 9.823e-02, -2.057e-03, -2.357e-01, 3.492e-01, -2.543e-01, 2.569e-02, -2.200e-01, -9.123e-02, -3.606e-02, 1.597e-02), r0);
	r1 = MulAdd(s0_1_2, M3x4(1.336e-02, 1.433e-02, 2.036e-01, 1.491e-02, -2.435e-02, 4.806e-01, 9.176e-01, 3.480e-02, 2.033e-02, 5.692e-02, 7.300e-02, 1.008e-02), r1);
	r2 = MulAdd(s0_1_2, M3x4(3.352e-03, 3.727e-02, 2.577e-03, 3.610e-02, -1.435e-02, -2.008e-02, 1.104e-03, -6.423e-03, 1.849e-02, 1.964e-02, 1.142e-03, -1.254e-04), r2);
	r0 = MulAdd(s0_2_0, M3x4(9.755e-02, 7.786e-02, 4.177e-02, 3.039e-02, -5.141e-02, 1.119e-01, 1.274e-01, 4.471e-03, -2.445e-02, -1.436e-02, 2.118e-02, -6.314e-02), r0);
	r1 = MulAdd(s0_2_0, M3x4(-6.104e-03, -6.230e-02, -6.457e-03, -2.358e-03, 5.778e-03, -2.301e-01, 6.849e-03, -1.679e-02, -4.539e-03, -1.902e-02, -6.001e-04, -1.960e-03), r1);
	r2 = MulAdd(s0_2_0, M3x4(2.637e-03, 3.330e-01, 2.302e-01, 3.634e-02, -9.790e-04, 1.051e+00, 7.835e-01, 7.599e-02, -1.271e-03, 7.008e-02, 7.817e-02, 5.323e-03), r2);
	r0 = MulAdd(s0_2_1, M3x4(-1.597e-01, -4.237e-02, -2.068e-02, 1.278e-02, 7.768e-02, -2.273e-02, 1.392e-01, 1.641e-02, 8.793e-02, -3.311e-02, 4.318e-03, -2.049e-02), r0);
	r1 = MulAdd(s0_2_1, M3x4(1.882e-01, 4.531e-02, 1.395e-02, 5.440e-02, 7.944e-01, -3.128e-02, -2.484e-02, 7.494e-02, 6.330e-02, 6.745e-04, 3.291e-03, 1.395e-02), r1);
	r2 = MulAdd(s0_2_1, M3x4(4.550e-03, -5.659e-02, -2.260e-01, -7.046e-03, -6.486e-03, -1.208e-02, -7.871e-01, 1.704e-01, 2.202e-03, 4.179e-02, -8.120e-02, 1.672e-02), r2);
	r0 = MulAdd(s0_2_2, M3x4(2.425e-01, 2.374e-02, -2.404e-02, 1.450e-02, -1.131e-01, 1.969e-02, 8.958e-02, -1.023e-01, -1.361e-01, -2.301e-02, -4.773e-02, 4.883e-02), r0);
	r1 = MulAdd(s0_2_2, M3x4(-1.753e-02, 9.386e-02, 4.877e-03, 2.227e-03, 1.747e-02, 5.314e-01, 6.404e-03, -3.009e-02, -1.093e-02, 5.469e-02, -1.424e-03, -7.789e-03), r1);
	r2 = MulAdd(s0_2_2, M3x4(5.474e-03, 1.939e-02, -1.011e-02, 9.980e-03, 9.453e-04, -1.498e-02, -2.597e-03, 6.000e-02, -8.301e-03, -3.578e-02, 3.540e-03, -9.692e-03), r2);
	// Clamp negatives to zero (ReLU) and store each vector at this pixel.
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
	r2 = max(r2, 0.0);
	T2[gxy] = r2;
}

//!PASS 2
//!DESC conv1 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T0, T1, T2
//!OUT T3, T4, T5

// L0/L1/L2(x, y): the four channels of T0/T1/T2 at texel offset (x, y) from
// the current pixel. They expand through O(), so `pos` and `pt` must be in scope.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))
#define L2(x, y) V4(O(T2, x, y))

// Pass 2 ("conv1 (12x12)"): for one pixel, reads the 3x3 neighbourhoods of T0,
// T1 and T2, accumulates them through per-tap 4x4 weight matrices into three
// 4-component vectors, clamps negatives to zero and stores the results in T3,
// T4 and T5. The accumulators start at zero; no constant offset is added here.
//
// blockStart: added to the remapped thread coordinate to get the pixel handled
//             by this invocation.
// tid:        thread id; only tid.x is used, as the argument to Rmp8x8.
void Pass2(uint2 blockStart, uint3 tid) {
	// presumably the size of one input texel in normalized coordinates —
	// GetInputPt is defined outside this file; TODO confirm
	float2 pt = float2(GetInputPt());
	// presumably Rmp8x8 maps the linear thread index onto an 8x8 tile — confirm
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads whose pixel lies outside the input do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// The +0.5 addresses the middle of pixel gxy; `pos` and `pt` are what the
	// O()/L0()/L1()/L2() macros read.
	float2 pos = (gxy + 0.5) * pt;
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// Fetch the 3x3 neighbourhoods of T0 (s0_*) and T1 (s1_*):
	// s<n>_<row>_<col>, row/col 0..2 <-> offset -1..+1.
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Contribution of the T0 taps. MulAdd is defined outside this file;
	// presumably it returns the sample times the matrix plus the accumulator.
	r0 = MulAdd(s0_0_0, M4(9.525e-02, 1.266e-02, 3.445e-03, 7.378e-03, 1.958e-02, 2.888e-02, -3.022e-04, -4.919e-02, 1.474e-02, 5.379e-02, 2.465e-02, -2.110e-02, 1.007e-01, 4.968e-02, 7.625e-03, -2.355e-02), r0);
	r1 = MulAdd(s0_0_0, M4(-5.259e-02, -3.138e-02, 1.081e-01, -1.583e-01, -8.368e-02, 1.400e-01, -5.287e-02, 5.207e-02, -8.263e-02, 5.925e-02, -6.862e-02, 2.268e-01, 8.502e-02, -1.662e-02, 4.158e-02, 1.310e-02), r1);
	r2 = MulAdd(s0_0_0, M4(6.493e-03, 1.380e-02, 3.747e-03, -2.227e-02, 1.202e-02, -1.451e-04, 2.250e-02, 3.513e-02, 2.935e-02, 3.707e-02, 2.023e-02, 5.505e-02, -5.075e-02, 3.709e-03, -9.123e-02, 3.416e-02), r2);
	r0 = MulAdd(s0_0_1, M4(-1.232e-02, 4.537e-02, 8.322e-03, -2.819e-02, -7.270e-02, -1.638e-01, 7.426e-03, 4.253e-02, -2.711e-01, -6.879e-02, 4.308e-02, 3.822e-03, -4.461e-02, -4.596e-02, -1.255e-02, 2.594e-02), r0);
	r1 = MulAdd(s0_0_1, M4(-1.456e-01, 8.759e-02, -1.232e-01, 1.440e-01, -9.656e-02, -8.897e-02, 9.890e-02, -8.393e-01, 8.269e-02, -8.710e-02, 2.890e-01, 4.217e-01, 4.398e-02, 2.851e-02, -5.137e-03, 1.096e-02), r1);
	r2 = MulAdd(s0_0_1, M4(5.458e-02, -2.263e-02, -5.579e-02, 3.635e-02, 1.768e-01, 3.603e-02, 1.203e-02, 2.682e-02, 7.902e-02, 8.412e-03, 3.872e-01, 8.573e-02, 7.203e-02, -8.531e-03, -2.039e-01, 7.960e-02), r2);
	r0 = MulAdd(s0_0_2, M4(-9.491e-02, -1.431e-02, 1.541e-02, 7.125e-03, -2.342e-01, -5.845e-02, -3.911e-02, 1.345e-02, -6.537e-02, -2.341e-03, -1.913e-02, -7.491e-02, -2.823e-02, -7.889e-04, 2.246e-03, -2.572e-02), r0);
	r1 = MulAdd(s0_0_2, M4(3.182e-02, -1.205e-01, -8.174e-02, 9.559e-02, -4.225e-03, 8.761e-03, 9.877e-02, -4.413e-03, 9.772e-02, -5.354e-02, -1.508e-01, 1.507e-02, -1.111e-01, -5.514e-03, -1.764e-02, -4.508e-02), r1);
	r2 = MulAdd(s0_0_2, M4(-8.983e-02, 9.532e-03, 6.249e-02, 1.644e-02, -1.396e-01, -3.630e-02, -3.226e-01, -8.937e-02, 3.096e-02, -2.228e-02, 6.260e-02, 2.801e-02, -2.691e-02, -1.946e-02, 3.042e-01, 6.429e-02), r2);
	r0 = MulAdd(s0_1_0, M4(4.774e-02, -2.753e-03, 3.733e-02, 5.234e-02, 2.063e-01, 6.343e-02, -2.685e-02, -2.345e-02, -1.343e-01, -1.969e-01, 1.783e-02, 4.953e-02, 1.283e-01, -1.768e-02, 6.969e-03, -3.803e-02), r0);
	r1 = MulAdd(s0_1_0, M4(-6.848e-02, 1.805e-01, -3.072e-01, 8.786e-02, -4.533e+00, -2.733e-01, 9.678e-02, -7.903e-02, 6.315e-01, -2.436e-01, 6.929e-02, 2.639e-03, 4.882e-02, 8.620e-03, -1.157e-02, 2.297e-01), r1);
	r2 = MulAdd(s0_1_0, M4(6.044e-02, 3.137e-02, 2.344e-01, 4.057e-03, -3.721e-02, 4.214e-02, 6.506e-02, 6.430e-03, 4.382e-02, 2.849e-02, -3.235e-01, -1.559e-01, 2.660e-02, 8.349e-02, 1.540e-02, 5.069e-02), r2);
	r0 = MulAdd(s0_1_1, M4(-2.867e-01, -1.510e-01, -3.847e-02, -6.369e-03, -3.088e-01, -3.446e-01, -4.467e-02, 3.751e-03, -1.635e-01, -3.527e-02, -4.410e-02, 3.503e-02, 3.858e-02, 1.053e-01, -2.426e-02, 1.345e-01), r0);
	r1 = MulAdd(s0_1_1, M4(4.979e-01, -2.382e-01, 2.582e-01, -7.223e-02, -5.607e-01, 5.871e-02, 8.606e-01, -1.709e-01, -6.113e-01, 1.313e-01, 8.415e-01, -1.489e-01, -1.753e-01, -5.653e-02, 3.266e-02, -3.678e-01), r1);
	r2 = MulAdd(s0_1_1, M4(-2.102e-01, -1.642e-02, -3.817e-01, -1.005e-04, 9.513e-02, 1.033e-02, -6.548e-01, 5.214e-02, 4.227e-02, 8.670e-02, -2.668e-01, -2.339e-01, -1.422e-01, 4.787e-03, 4.968e-02, 1.020e-02), r2);
	r0 = MulAdd(s0_1_2, M4(2.441e-01, 1.154e-02, -4.067e-02, 2.503e-02, 2.840e-01, -3.493e-02, 2.509e-02, 1.958e-02, 3.630e-01, 1.712e-01, 1.766e-02, 3.671e-02, -5.253e-02, -3.856e-02, 2.564e-02, 2.975e-02), r0);
	r1 = MulAdd(s0_1_2, M4(-1.461e-01, 1.606e-01, 2.931e-01, -2.132e-01, 4.188e-02, 6.921e-02, 6.395e-03, 1.733e-02, 1.809e-01, -1.000e-01, -3.229e-01, -5.873e-02, -3.083e-02, 4.404e-02, 3.474e-02, 1.116e-01), r1);
	r2 = MulAdd(s0_1_2, M4(2.153e-01, -2.386e-02, 7.123e-02, 9.820e-03, -3.459e-04, 2.167e-02, -8.050e+00, 2.230e-02, -1.258e-01, 1.100e-02, -9.690e-01, 7.935e-02, -1.630e-02, 2.041e-02, -1.367e-01, 8.399e-02), r2);
	r0 = MulAdd(s0_2_0, M4(-5.044e-02, -6.445e-03, 1.264e-03, -2.473e-02, 8.443e-02, -7.054e-02, -6.142e-03, -6.069e-02, -2.642e-02, 6.221e-02, -5.325e-02, 9.792e-04, -2.286e-02, -6.836e-02, 6.665e-02, 1.343e-01), r0);
	r1 = MulAdd(s0_2_0, M4(-3.037e-02, -7.044e-03, 6.013e-02, -3.630e-02, -2.393e-01, -7.752e-02, 1.786e-01, 1.179e-02, -1.429e-01, -7.992e-02, 9.990e-02, -2.557e-02, -5.351e-01, 2.478e-01, -1.597e-01, 1.214e-01), r1);
	r2 = MulAdd(s0_2_0, M4(-9.399e-02, -5.158e-04, -1.260e-01, 5.753e-02, 2.553e-01, -2.938e-03, -5.913e-04, 3.223e-02, 5.890e-02, 8.018e-03, 1.550e-01, -1.762e-01, -8.908e-02, -4.383e-02, 1.972e-01, -3.748e-02), r2);
	r0 = MulAdd(s0_2_1, M4(2.007e-01, 4.391e-02, 1.759e-02, 1.245e-02, 1.649e-01, -3.839e-02, -1.313e-01, -1.255e-02, -1.855e-02, 4.078e-02, 2.373e-02, -3.109e-02, 1.513e-01, -2.762e-01, -1.865e-01, -8.377e-02), r0);
	r1 = MulAdd(s0_2_1, M4(-1.297e-01, -9.058e-02, -5.008e-02, -2.317e-02, -1.818e-01, -1.354e-01, -1.195e-01, -1.166e-01, -2.419e-02, 8.370e-02, -2.687e-01, -9.035e-02, -3.246e-01, -1.675e-01, 2.107e-01, -4.834e-01), r1);
	r2 = MulAdd(s0_2_1, M4(1.626e-01, 1.181e-02, 2.537e-01, -3.441e-02, -8.744e-02, -4.786e-02, -5.942e-02, -1.541e-01, -3.018e-01, -3.299e-02, -2.477e-01, -2.223e-01, 1.174e-01, -4.864e-02, -4.832e-01, -4.261e-02), r2);
	r0 = MulAdd(s0_2_2, M4(-1.123e-01, 1.728e-02, -7.725e-03, -1.937e-02, 2.393e-02, 9.625e-03, -3.186e-02, -5.783e-02, -8.287e-02, -4.418e-02, 3.169e-02, -4.808e-03, 2.759e-01, 5.622e-02, 9.001e-02, 4.963e-02), r0);
	r1 = MulAdd(s0_2_2, M4(-5.944e-02, 9.065e-03, -1.003e-01, 1.170e-01, -2.443e-03, 9.089e-02, -1.127e-01, 4.456e-02, -7.984e-04, -8.293e-02, -3.158e-01, 4.708e-02, -1.143e-01, 2.366e-01, -1.010e-01, -1.101e-02), r1);
	r2 = MulAdd(s0_2_2, M4(-1.531e-01, 1.518e-02, -1.342e-01, -4.859e-02, -1.913e-01, -9.979e-03, 3.440e-02, -5.432e-02, -9.941e-02, 1.854e-02, 5.622e-02, 6.923e-02, 1.733e-01, 1.805e-02, 1.890e-01, -6.919e-02), r2);
	// Contribution of the T1 taps.
	r0 = MulAdd(s1_0_0, M4(3.391e-01, 2.425e-01, -8.351e-02, 2.316e-01, 1.787e-01, 4.504e-02, -1.824e-03, 2.724e-02, 3.840e-01, 6.608e-02, 3.379e-02, 4.463e-01, -4.328e-01, -2.923e-01, 5.945e-02, -2.570e-01), r0);
	r1 = MulAdd(s1_0_0, M4(-2.262e-01, -4.842e-03, 3.272e-01, -5.540e-01, 2.632e-02, 8.292e-02, 4.413e-02, -1.459e-01, -3.279e-01, -2.804e-02, -2.740e-01, -7.485e-01, 9.982e-02, -4.858e-01, -5.180e-02, 2.607e-01), r1);
	r2 = MulAdd(s1_0_0, M4(3.058e-02, -1.037e-01, 1.230e-01, -2.595e-01, -6.311e-02, -7.707e-03, -1.219e-02, -8.179e-02, -1.037e-01, 1.548e-01, -1.459e-01, -5.167e-02, -1.150e-01, -1.380e-01, 2.104e-01, 3.368e-01), r2);
	r0 = MulAdd(s1_0_1, M4(-4.619e-02, -5.872e-02, -6.361e-03, 8.629e-03, 2.252e-01, 4.519e-02, -2.644e-02, 1.465e-02, 2.064e-01, -8.797e-02, 6.324e-02, 2.020e-02, 2.898e-01, 1.881e-01, 3.965e-02, 1.021e-01), r0);
	r1 = MulAdd(s1_0_1, M4(1.489e-01, 6.459e-02, -2.539e+00, -1.099e+01, 1.161e-01, 9.301e-02, 2.458e-01, 9.420e-02, 2.607e-01, 1.895e-01, 3.102e-01, -5.133e-01, -6.039e-03, 6.094e-02, -2.048e-01, -4.620e-01), r1);
	r2 = MulAdd(s1_0_1, M4(-8.125e-02, 1.500e-01, -7.482e-02, -3.429e-01, -6.732e-02, 2.407e-02, -2.744e-01, -3.914e-02, -8.471e-02, 7.836e-03, -4.434e-01, -3.630e-02, -3.270e-02, 7.112e-02, -2.415e-01, 4.222e-01), r2);
	r0 = MulAdd(s1_0_2, M4(9.232e-01, -1.510e-01, 1.315e-01, 4.544e-02, -5.102e-02, 1.300e-02, 5.796e-03, -3.541e-02, -4.598e-02, -7.197e-03, 1.332e-02, -8.201e-03, 2.325e-01, -2.044e-02, -1.190e-01, -1.319e-02), r0);
	r1 = MulAdd(s1_0_2, M4(1.536e-01, -1.440e-01, -7.880e-02, -3.087e-01, 3.748e-02, 1.257e-02, 5.745e-02, -6.155e-02, 6.704e-02, -3.070e-02, 7.411e-02, 1.362e-02, -3.260e-02, 4.048e-01, 2.097e-01, 1.072e-01), r1);
	r2 = MulAdd(s1_0_2, M4(-4.834e-01, 2.804e-02, -1.024e-01, -1.312e-01, -8.083e-02, -5.414e-03, 1.539e-02, 3.552e-02, 6.698e-02, 1.451e-02, -4.072e-01, -5.120e-02, 1.823e-01, -9.009e-02, -9.889e-02, 3.002e-01), r2);
	r0 = MulAdd(s1_1_0, M4(3.700e-01, 1.198e-02, -5.900e-02, -1.685e-01, 1.926e-01, 7.565e-02, -4.254e-03, 2.076e-02, 1.345e-01, 1.137e+00, 1.037e-01, 3.461e-01, 3.581e-01, 2.222e-01, -4.287e-01, 6.309e-01), r0);
	r1 = MulAdd(s1_1_0, M4(-3.201e+00, -2.368e-01, 3.491e-01, 3.497e-03, -1.627e-01, 9.939e-02, 1.942e-01, -1.381e-02, -5.971e+00, 2.788e-01, -2.161e+00, 4.417e-01, -3.290e-01, -7.201e-01, 2.632e-01, -2.907e-01), r1);
	r2 = MulAdd(s1_1_0, M4(-3.897e-01, 1.248e-01, 4.911e-02, -1.202e-01, -1.392e-01, 2.658e-02, 2.311e-04, -6.858e-02, 4.365e-01, 2.058e-01, 1.479e-01, 2.179e-01, 2.061e-01, -7.059e-01, 8.722e-02, -2.070e-02), r2);
	r0 = MulAdd(s1_1_1, M4(-3.332e+00, 2.122e-01, 9.245e-01, 1.774e-01, 2.544e-01, 1.545e-01, 3.007e-02, 7.024e-03, -3.430e+00, 8.651e-02, 1.736e-01, 8.504e-04, 4.638e-02, -1.583e-01, 4.245e-01, -3.628e-01), r0);
	r1 = MulAdd(s1_1_1, M4(2.218e-01, -2.080e-02, -7.942e-02, 1.266e-01, 4.827e-02, -1.145e-01, 1.820e-01, 1.021e-01, -6.865e-02, -1.046e-01, -4.027e-01, 2.035e-01, -4.482e-01, 1.105e+00, 2.684e-01, 4.417e-01), r1);
	r2 = MulAdd(s1_1_1, M4(5.336e-01, -1.328e-01, 2.026e-01, -2.223e-01, -3.037e-01, 2.106e-02, -3.656e-01, 1.059e-02, 4.970e-01, 1.213e-01, 5.399e-01, -1.509e-01, 4.336e-03, 8.515e-01, 1.899e-01, -5.825e-01), r2);
	r0 = MulAdd(s1_1_2, M4(-2.647e-01, 2.581e-02, -1.091e-01, -2.832e-02, 8.882e-02, 6.996e-02, -5.836e-03, -1.641e-02, 6.180e-02, 1.182e-02, 4.131e-02, 3.867e-02, -1.626e-01, 2.079e-02, 3.531e-02, -4.426e-02), r0);
	r1 = MulAdd(s1_1_2, M4(-9.842e-02, -4.417e-02, -1.064e-01, 1.662e-01, 6.878e-04, 8.423e-02, 1.025e-01, -5.273e-02, 1.255e-01, -8.911e-02, 6.974e-02, 5.234e-02, 3.513e-01, -3.335e-01, -3.144e-01, -3.230e-02), r1);
	r2 = MulAdd(s1_1_2, M4(3.126e-01, 7.491e-03, -9.629e-01, -1.333e-01, -2.116e-01, -8.586e-03, -1.971e-01, 5.712e-02, -1.370e-02, -5.309e-03, -2.946e-01, 2.660e-02, -1.323e-01, 1.245e-01, -3.645e-01, -3.674e-02), r2);
	r0 = MulAdd(s1_2_0, M4(7.670e-02, 8.595e-03, 3.244e-02, -1.346e-02, 2.134e-02, 1.548e-01, 4.336e-03, 5.201e-03, 1.897e-02, -4.983e-01, 1.004e-01, -1.033e-01, -7.343e-02, 2.295e-02, 9.595e-02, -7.352e-02), r0);
	r1 = MulAdd(s1_2_0, M4(-4.434e-02, 4.009e-02, 8.151e-02, 3.921e-02, 1.460e-01, 7.789e-02, -3.046e-02, 1.184e-02, -5.098e-01, 1.579e-01, 3.040e-01, 1.067e-03, 2.557e-01, -3.645e-01, -1.889e-01, -7.528e-03), r1);
	r2 = MulAdd(s1_2_0, M4(-4.813e-01, -3.520e-02, -8.284e-02, 1.183e-01, -2.144e-02, -1.212e-02, 3.596e-02, -6.149e-02, -1.208e-01, 1.994e-02, 2.183e-01, 3.601e-01, -7.420e-01, -2.703e-03, -2.526e-01, -1.911e-01), r2);
	r0 = MulAdd(s1_2_1, M4(1.636e-01, -4.141e-02, -1.152e-01, -3.378e-02, 3.631e-02, 1.095e-01, 1.823e-02, -1.236e-02, -7.966e-02, -4.403e-01, 1.980e-01, 4.628e-02, 7.260e-02, 2.845e-02, -1.146e-01, 3.730e-02), r0);
	r1 = MulAdd(s1_2_1, M4(-2.549e-02, -1.408e-01, 1.984e-01, 3.723e-02, 1.239e-01, 1.850e-03, 2.271e-02, -8.864e-03, 2.747e-02, 4.286e-02, 3.197e-01, -3.832e-02, -4.773e-02, 2.914e-01, 1.265e-01, -1.169e-02), r1);
	r2 = MulAdd(s1_2_1, M4(-1.469e-01, 2.410e-02, -6.237e-02, 3.077e-01, -6.164e-01, -4.866e-04, -1.044e-01, -8.553e-02, -2.641e-01, 1.375e-02, 6.543e-01, 1.125e-01, 5.215e-01, 7.470e-04, 6.003e-02, 4.529e-02), r2);
	r0 = MulAdd(s1_2_2, M4(9.014e-02, -6.008e-02, -5.005e-02, 1.726e-02, 4.845e-02, 1.904e-02, -2.193e-02, 8.074e-03, -1.087e-01, -7.837e-02, 3.213e-02, 2.108e-02, -1.358e-01, -6.511e-02, 3.273e-02, 6.567e-02), r0);
	r1 = MulAdd(s1_2_2, M4(1.559e-01, -2.527e-02, 1.604e-01, 1.789e-02, 9.444e-02, 1.125e-04, 1.713e-01, 1.110e-03, -6.420e-03, -4.359e-02, -1.523e-01, 1.942e-02, -1.115e-01, -3.417e-03, 7.279e-02, -6.073e-02), r1);
	r2 = MulAdd(s1_2_2, M4(2.469e-01, 2.840e-02, 7.367e-02, 1.362e-01, -2.410e-01, -8.207e-04, -3.442e-02, 1.836e-02, 1.243e-01, 1.074e-02, 1.199e-02, -3.521e-02, -1.363e-02, -3.894e-02, 1.867e-01, -1.963e-01), r2);
	// The T0 samples are fully consumed, so s0_* is reused for the T2 neighbourhood.
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	// Contribution of the T2 taps.
	r0 = MulAdd(s0_0_0, M4(9.631e-02, -2.804e-02, -9.453e-03, 1.323e-01, -3.873e-02, -1.756e-02, -5.242e-03, 7.056e-02, 2.419e-01, 9.249e-02, 5.779e-03, -9.669e-03, -2.893e-01, -4.675e-02, 8.553e-03, -1.525e-01), r0);
	r1 = MulAdd(s0_0_0, M4(-2.398e-02, -1.289e-01, 4.694e-03, -2.516e-01, 1.582e-01, -1.565e-01, 6.696e-02, -6.893e-02, -5.530e+00, 2.511e-01, -5.747e-02, -7.958e-02, 1.228e-01, -8.098e-04, 7.861e-03, 4.258e-02), r1);
	r2 = MulAdd(s0_0_0, M4(2.988e-02, 9.818e-03, 7.718e-02, -1.235e-01, 1.831e-03, -1.122e-02, 7.387e-03, -3.717e-02, -2.235e-03, 1.173e-02, 1.812e-01, 1.277e-01, 3.759e-02, 5.075e-02, -9.082e-02, 2.051e-01), r2);
	r0 = MulAdd(s0_0_1, M4(2.417e-01, 5.696e-03, -2.523e-02, 3.335e-02, 2.994e-01, -4.373e-02, -1.829e-02, 2.144e-01, -2.051e+00, -2.904e+00, -1.204e-03, -2.725e-01, -5.958e-01, 1.640e-01, 2.718e-02, -2.490e-02), r0);
	r1 = MulAdd(s0_0_1, M4(1.050e-01, -7.589e-02, -7.257e-02, -2.665e-01, -4.346e-02, -2.837e-02, 3.254e-02, -2.350e-01, 8.236e-01, -3.002e-02, -4.036e-01, 1.187e-01, -3.916e-01, -6.106e-03, -2.168e-01, 2.283e-01), r1);
	r2 = MulAdd(s0_0_1, M4(-4.683e-02, -4.553e-02, -8.636e-02, -1.352e-01, 9.251e-02, 1.189e-01, 3.465e-02, 6.706e-03, -3.065e-02, 1.250e-01, 1.267e-01, -1.171e-01, -1.415e-01, -1.284e-01, 2.027e-01, 2.410e-01), r2);
	r0 = MulAdd(s0_0_2, M4(1.786e-01, 3.389e-02, 1.466e-02, -4.641e-03, -8.848e-01, 3.649e-02, 6.161e-02, 4.990e-02, 3.555e-01, -2.781e-01, 1.542e-01, 3.356e-02, 2.407e-01, 6.276e-02, -3.615e-02, -4.808e-02), r0);
	r1 = MulAdd(s0_0_2, M4(1.938e-02, 3.655e-02, 3.674e-02, 1.314e-02, -1.608e-02, -1.874e-01, -1.124e+00, 1.665e-01, 5.260e-02, -1.156e-02, -1.678e+00, -1.620e-01, -1.349e-01, 7.761e-02, 1.586e-01, -7.672e-02), r1);
	r2 = MulAdd(s0_0_2, M4(-1.392e-02, 2.595e-02, -1.714e-01, -6.997e-02, 3.154e-01, 8.306e-02, -1.322e-01, -6.563e-02, -4.973e-01, -5.676e-02, -2.141e+00, 2.090e-01, 1.009e-02, -2.920e-03, 2.474e-01, -6.145e-02), r2);
	r0 = MulAdd(s0_1_0, M4(-1.929e-01, -2.588e-01, 3.779e-02, -8.635e-03, -3.529e-02, 5.598e-02, 1.119e-02, -6.296e-03, 1.195e-01, -3.284e-02, -1.909e-03, 9.286e-02, -4.257e-01, 2.673e-02, 5.518e-02, 7.681e-02), r0);
	r1 = MulAdd(s0_1_0, M4(-3.640e+00, 1.669e-02, -4.130e-01, -1.931e-01, -9.340e-01, -8.276e-02, 4.574e-02, -1.106e-03, -9.860e-02, 1.926e-01, -4.612e-02, -5.462e-02, 1.960e+00, -9.520e-02, -1.051e-02, 1.545e-02), r1);
	r2 = MulAdd(s0_1_0, M4(-2.489e-02, 1.783e-01, 1.417e-01, 2.396e-01, 4.945e-02, 2.029e-02, 6.159e-02, 2.332e-02, -1.509e-01, 6.395e-03, -9.998e-02, 1.362e-01, -1.161e-01, -1.626e-01, -1.393e-01, -2.494e-01), r2);
	r0 = MulAdd(s0_1_1, M4(-3.274e+00, 2.682e-01, 1.102e-01, 7.051e-01, -1.348e+00, -1.402e-01, 1.560e-01, -5.292e-02, 2.280e-02, 2.007e-01, -8.571e-02, 1.169e-01, 8.401e-01, -1.686e-01, -3.213e-01, -2.289e-02), r0);
	r1 = MulAdd(s0_1_1, M4(3.720e-01, 2.042e-01, -3.725e-01, 8.848e-01, 1.794e-01, 1.152e-01, -4.944e-01, 1.010e-01, 2.857e-01, -2.353e-01, 3.256e-01, 5.700e-02, -2.873e-01, 6.139e-02, 2.183e-01, -6.211e-02), r1);
	r2 = MulAdd(s0_1_1, M4(-1.133e-02, 3.535e-01, 2.165e-01, -1.524e-01, 3.282e-02, 2.929e-04, 5.665e-02, 1.211e-01, -7.735e-02, 1.962e-01, 1.042e-01, 1.354e-01, 4.102e-01, 1.517e-01, -1.018e-01, 1.930e-01), r2);
	r0 = MulAdd(s0_1_2, M4(-1.035e+00, 1.804e-03, -1.218e-01, -1.179e-01, 2.256e-01, -9.327e-02, 5.833e-02, -2.050e-02, 2.045e-01, 2.872e-01, -4.038e-02, 5.665e-02, -1.688e-01, 1.628e-01, 1.684e-01, 4.537e-03), r0);
	r1 = MulAdd(s0_1_2, M4(-2.073e-01, 3.481e-02, 2.529e-01, 2.268e-01, -6.458e-02, -6.413e-02, 2.447e-01, 4.451e-02, 2.847e-01, -1.979e-01, -1.034e-01, -2.314e-02, -1.298e-01, -4.288e-02, -1.059e-01, -5.178e-02), r1);
	r2 = MulAdd(s0_1_2, M4(1.814e-01, -5.550e-02, -1.207e-01, 4.091e-02, -8.628e-02, -1.895e-02, -1.010e-01, -2.984e-02, 4.241e-01, 3.750e-02, -4.385e-01, -6.667e-02, -2.939e-01, -2.314e-02, 4.947e-01, -7.935e-02), r2);
	r0 = MulAdd(s0_2_0, M4(-1.120e-01, 3.196e-01, 5.164e-02, 1.143e-01, -3.380e-02, 2.722e-02, 3.650e-02, -2.876e-02, 6.177e-02, -1.992e-02, -1.522e-02, 3.466e-03, -3.762e-02, -1.678e-01, -1.084e-01, 2.907e-02), r0);
	r1 = MulAdd(s0_2_0, M4(8.000e-02, 1.485e-02, -2.207e-01, -4.029e-02, 2.246e-02, -1.914e-02, 8.531e-02, 1.385e-03, -7.882e-02, 8.078e-02, -4.635e-02, 2.131e-02, 1.998e-01, 8.228e-03, -2.912e-01, 9.723e-03), r1);
	r2 = MulAdd(s0_2_0, M4(1.966e-01, 2.201e-02, -1.946e-01, 4.151e-01, 1.084e-01, 1.185e-02, 7.331e-02, 2.434e-02, -1.656e-01, -6.117e-04, -7.607e-02, 5.965e-03, 5.628e-02, 1.161e-02, -1.454e-02, -1.642e-01), r2);
	r0 = MulAdd(s0_2_1, M4(3.607e-01, -2.947e-01, -1.931e-01, -4.023e-01, -2.717e-02, -1.100e-02, 1.147e-02, 1.111e-02, 5.457e-02, -4.799e-02, 4.258e-02, 1.010e-02, -8.689e-02, 1.692e-01, 1.298e-01, -4.596e-02), r0);
	r1 = MulAdd(s0_2_1, M4(1.801e-01, -3.279e-01, -2.748e+00, 1.461e-01, 9.752e-02, 2.452e-02, -1.179e-02, -2.788e-02, -3.933e-02, -2.861e-02, 1.512e-01, 7.932e-02, -4.109e-01, 1.843e-02, -1.122e-02, -3.969e-02), r1);
	r2 = MulAdd(s0_2_1, M4(-1.877e-02, 1.564e-01, 4.200e-01, 6.425e-01, 3.375e-01, 1.257e-02, 2.326e-02, 6.901e-03, -4.014e-01, 1.172e-02, 2.373e-02, 5.253e-02, 7.132e-02, -9.038e-02, -5.055e-03, -2.764e-01), r2);
	r0 = MulAdd(s0_2_2, M4(1.104e-02, -1.233e-01, 4.924e-02, 2.317e-02, -8.739e-02, 1.306e-02, -1.099e-02, -6.071e-04, 3.868e-02, -4.412e-02, 3.103e-02, -1.446e-03, 5.179e-02, -5.954e-02, 1.015e-02, -1.811e-02), r0);
	r1 = MulAdd(s0_2_2, M4(2.233e-01, -1.226e-01, 4.990e-01, -4.403e-03, -5.349e-02, 2.541e-02, -3.337e-02, -2.889e-02, 1.171e-01, -3.813e-02, 1.284e-01, 1.807e-03, -3.329e-01, 9.298e-02, -1.529e-01, 5.895e-02), r1);
	r2 = MulAdd(s0_2_2, M4(-2.705e-01, 3.745e-02, 8.321e-02, 3.103e-01, 5.026e-03, 3.200e-03, 3.430e-02, -5.324e-02, -9.025e-02, -3.901e-03, 7.197e-02, -2.309e-02, 2.425e-01, 1.996e-02, -4.584e-02, 1.744e-03), r2);
	// Clamp negatives to zero (ReLU) and store each vector at this pixel.
	r0 = max(r0, 0.0);
	T3[gxy] = r0;
	r1 = max(r1, 0.0);
	T4[gxy] = r1;
	r2 = max(r2, 0.0);
	T5[gxy] = r2;
}

//!PASS 3
//!DESC conv2 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T3, T4, T5
//!OUT T0, T1, T2

#define L0(x, y) V4(O(T3, x, y))
#define L1(x, y) V4(O(T4, x, y))
#define L2(x, y) V4(O(T5, x, y))

// Pass 3, "conv2 (12x12)": computes one output pixel of a 3x3 convolution.
//
// Inputs are the 3x3 neighbourhoods of T3, T4 and T5 (4 components each,
// i.e. 12 input values per tap). Every tap is multiplied by three 4x4 weight
// matrices, one per accumulator r0/r1/r2 (12 output values). The accumulators
// are clamped to be non-negative and written to T0, T1 and T2 at the same
// pixel coordinate.
//
// blockStart: added to the per-thread offset to form the pixel coordinate.
// tid:        only tid.x is read; it is passed to Rmp8x8.
//
// NOTE(review): MulAdd, Rmp8x8, GetInputPt, GetInputSize, MF4 and MF4x4 are
// defined outside this chunk. MulAdd(v, M, acc) presumably evaluates
// acc + mul(v, M), and Rmp8x8 presumably maps a linear thread index to a
// position inside an 8x8 block -- confirm against their definitions.
// The weight matrices are generated data; do not edit them by hand.
void Pass3(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Threads whose pixel lies outside the input size do nothing.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Sampling position for the centre tap; the +0.5 presumably targets the
	// texel centre, assuming pt is the size of one texel -- TODO confirm.
	float2 pos = (gxy + 0.5) * pt;
	// Tap naming: s<slot>_<row>_<col>, where row = y offset + 1 and
	// col = x offset + 1. Slot 0 first holds T3 taps and is later reloaded
	// with T5 taps; slot 1 holds T4 taps.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	// One accumulator per output texture (r0 -> T0, r1 -> T1, r2 -> T2).
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// Load the 3x3 neighbourhood of T3 (slot 0) and T4 (slot 1).
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Contribution of the T3 taps to r0, r1 and r2.
	r0 = MulAdd(s0_0_0, M4(-6.759e-03, -1.505e-02, 2.356e-03, -2.135e-02, -9.848e-03, -9.010e-02, 5.188e-03, 1.574e-02, 1.274e-01, 3.135e-01, -2.161e-03, -1.236e-01, 7.479e-03, 9.270e-02, 3.689e-03, -5.273e-03), r0);
	r1 = MulAdd(s0_0_0, M4(-1.580e-02, -1.418e-02, 5.399e-02, -8.306e-02, 2.179e-02, 5.891e-02, 1.409e-01, -8.571e-02, 2.366e-01, 1.275e-01, -1.676e-01, -1.280e-01, 1.564e-02, 2.385e-02, -4.874e-02, 1.494e-01), r1);
	r2 = MulAdd(s0_0_0, M4(-7.715e-03, -1.194e-02, -2.538e-03, 4.241e-02, -6.301e-03, 2.680e-02, -2.478e-02, 8.129e-02, -1.247e-01, 1.184e-02, -4.004e-02, -1.638e-01, -2.347e-04, 4.530e-02, -3.080e-02, -3.733e-02), r2);
	r0 = MulAdd(s0_0_1, M4(9.757e-03, -3.255e-02, 1.727e-02, 2.885e-02, -7.986e-02, 2.617e-01, 2.018e-02, 5.360e-02, -2.950e-01, -7.824e-03, 1.238e-01, -1.384e-01, -6.957e-02, -2.548e-02, -1.444e-03, 3.309e-02), r0);
	r1 = MulAdd(s0_0_1, M4(-1.924e-02, 8.881e-02, 9.455e-02, 2.853e-02, -2.841e-02, -7.529e-02, 2.778e-01, 1.516e-01, 3.797e-02, 2.660e-01, -3.899e-01, 6.196e-02, -1.187e-02, 1.479e-01, -5.362e-02, 2.011e-01), r1);
	r2 = MulAdd(s0_0_1, M4(9.283e-03, 1.542e-02, 2.879e-02, -8.667e-02, 4.427e-02, 7.304e-02, -1.860e-02, -2.831e-02, 3.491e-01, 1.712e-01, 2.666e-01, 2.057e-01, 6.653e-02, 7.203e-02, -5.034e-02, -5.006e-02), r2);
	r0 = MulAdd(s0_0_2, M4(-5.881e-03, 3.936e-02, -3.432e-03, 8.615e-02, 1.244e-02, 9.838e-02, 7.745e-03, 2.848e-02, -5.667e-02, 7.611e-03, 3.827e-03, 1.540e-01, -9.220e-03, 6.165e-02, -2.547e-02, 1.077e-01), r0);
	r1 = MulAdd(s0_0_2, M4(5.461e-04, -2.705e-02, 2.154e-01, 5.772e-02, -4.514e-02, -2.848e-02, 2.481e-01, 1.294e-01, -5.020e-02, -1.949e-02, 7.258e-02, 2.232e-01, 3.700e-02, -2.129e-02, 2.838e-02, 2.368e-01), r1);
	r2 = MulAdd(s0_0_2, M4(1.580e-02, -7.514e-03, -8.639e-03, 3.299e-02, 5.892e-02, 1.130e-01, -8.888e-02, 6.520e-02, 9.644e-02, -1.923e-02, -1.241e-01, 1.050e-01, 6.810e-02, 4.524e-02, -8.995e-02, -2.264e-02), r2);
	r0 = MulAdd(s0_1_0, M4(-3.921e-03, -1.237e-01, -1.254e-02, -2.860e-02, -7.210e-03, 1.811e-01, 1.227e-02, -1.525e-02, 9.198e-02, 2.154e-01, -1.949e-01, -3.529e-02, 5.385e-02, 3.447e-01, -2.644e-02, -3.419e-02), r0);
	r1 = MulAdd(s0_1_0, M4(-6.401e-02, -1.463e-01, -2.095e-01, 4.736e-02, -6.283e-02, 5.619e-02, 6.241e-02, -1.372e-01, -1.676e-01, -8.025e-02, 1.257e-02, -2.190e-01, 4.044e-03, 2.763e-02, 1.439e-02, 1.586e-01), r1);
	r2 = MulAdd(s0_1_0, M4(3.096e-03, 1.511e-02, 1.140e-01, -5.856e-02, 1.590e-02, 6.986e-02, -1.801e-02, 1.401e-01, -8.360e-02, -2.062e-01, -1.107e-01, -8.202e-02, 1.340e-02, -5.605e-02, -1.187e-01, -1.065e-01), r2);
	r0 = MulAdd(s0_1_1, M4(-1.194e-01, -1.284e-01, 4.426e-02, -9.848e-02, -9.053e-02, 1.209e-01, 1.412e-01, -4.932e-02, 4.005e-01, 3.335e-01, -2.358e-01, 2.143e-01, -2.045e-01, -5.232e-01, 6.471e-02, 4.358e-02), r0);
	r1 = MulAdd(s0_1_1, M4(6.280e-02, 2.705e-01, -9.820e-01, -1.780e+00, 1.233e-01, 2.027e-01, 6.027e-01, -3.098e-02, 4.446e-01, 2.518e-02, 1.602e-01, 4.224e-01, -2.054e-01, 1.402e-01, -3.910e-01, -3.486e-01), r1);
	r2 = MulAdd(s0_1_1, M4(-1.433e-02, -1.651e-01, 4.444e-01, -2.871e-01, 2.485e-01, 2.781e-01, -1.700e-01, 2.833e-01, -3.115e-01, -7.890e-02, -1.859e-01, 3.705e-01, 1.939e-01, -2.440e-01, -4.381e-03, 2.825e-01), r2);
	r0 = MulAdd(s0_1_2, M4(-3.257e-02, 6.124e-02, -1.168e-03, 6.272e-02, -1.960e-01, 2.225e-01, 1.492e-02, 2.386e-01, 4.710e-02, 2.351e-02, 4.334e-02, -3.379e-02, 2.085e-01, -9.796e-02, 5.211e-02, 3.489e-01), r0);
	r1 = MulAdd(s0_1_2, M4(-2.535e-02, -6.549e-02, 2.373e-01, 8.015e-02, -2.104e-01, -4.005e-01, -1.050e-01, 3.508e-02, -6.374e-02, -2.266e-01, 4.773e-02, 1.759e-01, -4.369e-02, 1.094e-01, -7.176e-02, -1.012e-01), r1);
	r2 = MulAdd(s0_1_2, M4(2.073e-02, -1.313e-01, 9.050e-02, 2.890e-02, 7.562e-02, 7.598e-02, 1.232e-01, 2.819e-01, -1.389e-04, 2.778e-02, 8.439e-02, 3.335e-02, -2.143e-01, -3.591e-01, 5.573e-02, -9.345e-02), r2);
	r0 = MulAdd(s0_2_0, M4(1.283e-02, -1.762e-02, -2.889e-03, 1.775e-02, -3.492e-02, 7.217e-02, -3.527e-02, 8.411e-03, 2.036e-02, -9.090e-02, -5.922e-03, -1.808e-02, 2.552e-02, -1.725e-01, 1.701e-02, -2.806e-02), r0);
	r1 = MulAdd(s0_2_0, M4(2.996e-02, 1.882e-01, 1.805e-02, 2.959e-01, -1.414e-02, 4.523e-02, -8.140e-02, -7.306e-02, -1.225e-03, 8.143e-02, -1.101e-01, 9.234e-02, 1.270e-01, 1.632e-01, 1.665e-01, 1.126e-01), r1);
	r2 = MulAdd(s0_2_0, M4(-3.544e-02, -4.571e-02, 5.553e-02, 8.032e-02, 1.984e-02, 9.358e-02, -3.331e-02, 1.835e-02, -9.506e-03, -2.274e-02, -5.319e-02, -4.837e-02, -1.702e-02, -2.493e-02, 4.273e-02, -3.310e-02), r2);
	r0 = MulAdd(s0_2_1, M4(-4.770e-02, 1.123e-01, -2.839e-02, 8.945e-04, -6.619e-02, 9.348e-02, -3.933e-03, 9.205e-02, -4.442e-03, -8.051e-02, 2.174e-02, 1.275e-02, -1.422e-01, -1.556e-01, 2.646e-01, -6.622e-02), r0);
	r1 = MulAdd(s0_2_1, M4(3.653e-02, 9.451e-02, -2.475e-01, 2.187e-01, -3.029e-02, -2.087e-01, 2.920e-01, -4.719e-01, -5.179e-02, 3.824e-02, 7.301e-02, -1.555e-01, -2.059e-01, -3.804e-01, -5.335e-01, -3.232e-01), r1);
	r2 = MulAdd(s0_2_1, M4(-3.173e-02, -1.196e-01, 2.885e-01, 4.242e-02, 8.618e-02, 1.354e-01, -5.229e-02, 5.659e-03, 5.497e-02, 1.137e-01, -1.064e-01, -1.654e-02, 2.501e-01, 1.548e-01, -2.891e-01, -4.272e-02), r2);
	r0 = MulAdd(s0_2_2, M4(1.063e-02, 3.782e-02, -2.144e-02, 1.326e-01, -5.911e-02, 6.685e-03, 1.411e-02, 1.441e-01, 1.583e-03, 5.785e-02, -1.711e-02, -1.195e-02, -1.130e-01, -1.257e-01, -1.891e-02, -7.505e-02), r0);
	r1 = MulAdd(s0_2_2, M4(-5.770e-03, -1.543e-01, 7.640e-02, 5.218e-02, -2.431e-02, -8.428e-02, 6.076e-02, -3.515e-01, 5.914e-03, -8.318e-02, 6.820e-02, -3.257e-02, -6.732e-02, 3.447e-01, -2.948e-01, -6.083e-02), r1);
	r2 = MulAdd(s0_2_2, M4(-1.643e-02, 9.018e-02, 9.565e-02, -9.905e-03, -2.777e-03, -4.201e-02, 1.910e-01, 2.960e-03, -1.180e-02, -4.284e-02, 4.498e-02, 8.336e-02, 9.301e-02, 2.375e-02, -9.536e-02, -1.039e-01), r2);
	// Contribution of the T4 taps to r0, r1 and r2.
	r0 = MulAdd(s1_0_0, M4(-3.232e-02, -1.392e-01, -2.154e-02, 8.684e-03, 1.473e-02, 1.880e-01, -1.315e-02, 3.371e-02, -2.070e-02, 3.211e-02, -2.351e-02, 2.569e-02, -5.806e-02, 1.929e-01, 1.368e-03, -2.081e-02), r0);
	r1 = MulAdd(s1_0_0, M4(-1.381e-02, 2.206e-02, 2.714e-01, -9.864e-02, -6.151e-02, -7.034e-02, 8.710e-02, 1.022e-02, 1.203e-02, -1.267e-02, -8.538e-02, -1.284e-01, 3.371e-02, 4.512e-02, -2.956e-01, 7.293e-02), r1);
	r2 = MulAdd(s1_0_0, M4(3.255e-03, -2.771e-03, -5.561e-02, 1.824e-02, 2.894e-02, 8.127e-03, -6.393e-02, 3.503e-02, 1.490e-03, 1.931e-02, -1.421e-02, 2.400e-02, 3.499e-02, 7.051e-02, 4.356e-02, 1.291e-02), r2);
	r0 = MulAdd(s1_0_1, M4(-8.468e-02, 1.223e-01, 3.317e-03, 2.475e-02, -1.007e-01, 3.256e-01, -4.773e-02, -3.219e-02, 2.716e-02, -1.034e-01, -3.047e-02, -1.714e-02, -4.797e-02, 1.171e-01, -6.142e-04, 9.536e-03), r0);
	r1 = MulAdd(s1_0_1, M4(1.179e-01, 8.259e-02, 1.625e-01, 1.930e-01, 7.982e-02, 1.957e-01, 4.223e-01, 1.018e-01, 6.428e-02, 8.548e-03, -1.717e-01, -2.778e-01, 6.160e-04, 5.557e-02, -8.745e-01, -8.361e-02), r1);
	r2 = MulAdd(s1_0_1, M4(1.154e-02, -2.620e-02, 1.522e-02, 3.887e-02, 1.342e-02, 8.653e-02, 2.729e-02, -8.847e-03, -1.207e-01, -9.943e-02, 8.319e-03, 6.700e-02, -2.150e-02, 8.855e-02, -3.030e-02, 5.261e-02), r2);
	r0 = MulAdd(s1_0_2, M4(-1.749e-02, 9.230e-02, 1.213e-02, -1.196e-01, -3.212e-02, 4.104e-02, 4.388e-02, 9.445e-02, 1.447e-02, -6.141e-02, -1.582e-02, -1.723e-02, -4.415e-02, 1.271e-04, 6.426e-03, -3.212e-02), r0);
	r1 = MulAdd(s1_0_2, M4(2.438e-02, -3.891e-03, 2.686e-01, -7.350e-02, 5.187e-02, -3.246e-02, 9.097e-02, -4.757e-02, 3.653e-02, -1.538e-02, 4.051e-02, -1.390e-01, 8.494e-02, 2.973e-02, -7.575e-01, -1.007e-01), r1);
	r2 = MulAdd(s1_0_2, M4(5.311e-03, 1.382e-01, 2.313e-02, 2.166e-02, 3.273e-02, 1.842e-01, 3.200e-02, 6.792e-02, -5.007e-02, -2.395e-02, 1.106e-03, -1.757e-02, 2.165e-02, 1.202e-02, 3.874e-02, 6.710e-02), r2);
	r0 = MulAdd(s1_1_0, M4(-5.297e-02, -7.059e-02, 3.115e-03, -2.544e-02, 5.847e-02, 4.033e-01, 5.774e-02, 2.709e-02, 3.724e-03, -2.364e-01, -5.125e-02, -8.305e-03, -1.323e-01, -7.102e-02, -9.253e-03, -8.608e-03), r0);
	r1 = MulAdd(s1_1_0, M4(-5.709e-02, -8.277e-02, 1.067e-01, -2.049e-01, 7.058e-02, 3.285e-02, -1.666e-01, 6.238e-02, -8.775e-02, -1.156e-01, -1.992e-01, -1.556e-01, -1.014e-01, 1.802e-01, 2.314e-01, 4.443e-02), r1);
	r2 = MulAdd(s1_1_0, M4(2.843e-02, -2.882e-02, -6.463e-02, 5.271e-02, -4.349e-02, 3.837e-02, 3.964e-02, -1.500e-01, -1.342e-02, -8.717e-02, 6.817e-02, 3.045e-02, 3.529e-02, -2.678e-02, -8.400e-02, 1.324e-01), r2);
	r0 = MulAdd(s1_1_1, M4(-9.799e-02, 2.398e-01, 7.693e-02, -2.964e-02, -1.767e-01, 3.493e-01, -1.063e-01, -7.992e-02, 2.932e-02, -3.393e-01, -4.814e-02, -1.720e-01, -1.586e-01, 5.379e-01, -7.118e-02, 4.628e-02), r0);
	r1 = MulAdd(s1_1_1, M4(1.841e-01, 4.493e-01, -1.644e-01, 3.375e-01, -2.744e-01, 1.349e-01, -3.830e-01, -4.070e-01, -3.018e-01, 1.444e-01, -2.143e-01, -7.086e-03, -2.674e-02, -7.800e-02, 2.016e-01, 1.616e-01), r1);
	r2 = MulAdd(s1_1_1, M4(-3.966e-02, 1.086e-01, 5.865e-02, -1.738e-01, -4.519e-02, -2.647e-01, 1.684e-01, 1.420e-01, -1.421e-01, -2.744e-01, 8.009e-03, -3.473e-01, 5.831e-02, 2.071e-01, -2.494e-01, 2.414e-02), r2);
	r0 = MulAdd(s1_1_2, M4(-6.101e-02, -2.213e-01, 9.060e-02, -7.942e+00, -1.255e-01, -9.981e-02, -4.617e-02, -3.347e-01, 5.673e-02, -3.055e-02, -2.132e-02, -5.782e-01, -5.440e-02, 1.824e-01, 2.427e-02, -2.083e-02), r0);
	r1 = MulAdd(s1_1_2, M4(3.273e-02, 1.237e-01, -6.635e-01, -5.795e-01, -1.405e-01, -5.086e-02, -3.688e-01, -1.909e-01, 2.969e-02, 2.998e-02, -5.395e-02, -1.955e-01, 1.254e-01, 1.295e-01, 9.138e-02, 2.108e-01), r1);
	r2 = MulAdd(s1_1_2, M4(-4.120e-02, -3.115e-01, 3.345e-01, -1.030e-01, 9.356e-02, 2.701e-01, 1.988e-01, -1.423e-01, -5.601e-02, -9.980e-03, -2.940e-02, -1.493e-01, -1.731e-02, -1.484e-02, -5.268e-03, 2.069e-01), r2);
	r0 = MulAdd(s1_2_0, M4(-9.784e-02, 1.578e-01, 4.306e-02, 2.858e-03, -1.519e-02, -5.994e-02, -3.936e-03, -4.726e-02, -4.825e-02, -1.958e-01, -1.119e-02, 2.527e-02, -1.377e-01, 3.506e-01, -3.783e-03, 3.177e-02), r0);
	r1 = MulAdd(s1_2_0, M4(1.658e-02, 1.978e-01, 5.051e-02, 1.294e-01, -1.068e-01, 1.227e-01, -1.411e-01, -6.666e-02, -7.549e-02, -3.135e-02, -1.060e-01, -4.586e-02, -5.916e-02, -1.474e-01, 3.264e-02, -2.533e-01), r1);
	r2 = MulAdd(s1_2_0, M4(1.645e-02, -2.914e-03, -4.603e-02, 6.803e-02, 2.154e-02, 2.931e-02, -1.135e-01, -9.852e-03, 1.574e-02, -3.317e-02, 1.650e-02, 3.483e-02, -7.826e-03, 4.164e-02, 8.609e-02, 7.429e-02), r2);
	r0 = MulAdd(s1_2_1, M4(-3.542e-02, -1.539e-01, -8.780e-02, 6.522e-02, -1.921e-01, -2.548e-01, -4.639e-03, 2.511e-02, -5.201e-02, -3.517e-02, -2.826e-02, -2.817e-02, 7.887e-02, 1.685e-01, 2.006e-01, -5.049e-02), r0);
	r1 = MulAdd(s1_2_1, M4(8.619e-03, 2.244e-01, 1.567e-01, 1.408e-01, -5.459e-02, 1.096e-01, 2.114e-01, -4.726e-01, -8.311e-02, 2.259e-01, -4.627e-02, -2.793e-02, 1.261e-01, 7.417e-02, -7.290e-02, 2.032e-02), r1);
	r2 = MulAdd(s1_2_1, M4(3.764e-03, 9.671e-02, 1.274e-01, 3.466e-02, 9.256e-02, 5.622e-02, -5.897e-02, 2.367e-02, -1.060e-02, -1.475e-02, -2.870e-02, -6.589e-02, -1.787e-01, -1.109e-01, 2.158e-01, 2.742e-03), r2);
	r0 = MulAdd(s1_2_2, M4(-2.640e-02, 1.261e-01, 4.421e-02, -1.221e-01, -9.794e-02, 3.996e-01, -1.184e-01, -2.059e-02, -1.182e-03, -4.215e-03, 4.078e-03, 9.946e-03, 8.333e-03, -1.370e-02, 1.337e-02, 1.006e-01), r0);
	r1 = MulAdd(s1_2_2, M4(6.639e-02, -2.302e-02, -2.505e-01, 2.445e-01, 1.800e-02, -4.631e-03, -1.594e-01, -5.270e-02, -1.248e-03, 1.477e-02, 1.812e-02, -5.717e-02, -5.804e-02, -1.394e-01, 8.077e-02, 2.228e-02), r1);
	r2 = MulAdd(s1_2_2, M4(-4.187e-02, -8.189e-02, 9.144e-02, 1.709e-01, 5.584e-03, 7.161e-02, 2.024e-01, -4.796e-02, -1.520e-02, 4.362e-02, -3.365e-03, 4.276e-03, -9.154e-02, 9.219e-03, -4.412e-02, 1.948e-03), r2);
	// The T3 taps are no longer needed: reuse slot 0 for the 3x3
	// neighbourhood of T5.
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	// Contribution of the T5 taps to r0, r1 and r2.
	r0 = MulAdd(s0_0_0, M4(-2.768e-02, -3.607e-01, -3.826e-03, 3.327e-02, -1.740e-02, -2.238e-01, 3.163e-02, -1.567e-02, 4.057e-02, -2.185e-01, 1.973e-02, 2.995e-02, -8.520e-03, -2.660e-01, 1.151e-02, 6.180e-02), r0);
	r1 = MulAdd(s0_0_0, M4(8.276e-02, 7.184e-02, 1.983e-01, -6.656e-02, -3.404e-02, 3.801e-02, 8.843e-02, -1.287e-01, 8.911e-02, 1.120e-01, 1.430e-01, 1.015e-01, -7.758e-03, 1.002e-02, 5.329e-02, -1.200e-01), r1);
	r2 = MulAdd(s0_0_0, M4(6.094e-02, -1.348e-01, 9.211e-02, 2.328e-01, -3.509e-03, -3.889e-02, 3.535e-02, 9.397e-03, 6.235e-02, -9.936e-02, 7.862e-02, -3.623e-02, -7.822e-02, 8.180e-03, 3.507e-02, -2.923e-02), r2);
	r0 = MulAdd(s0_0_1, M4(-6.557e-02, -3.243e-01, 4.582e-02, -7.827e-02, 2.307e-01, -1.248e-01, 1.853e-02, -1.030e-02, -5.730e-02, -2.801e-01, -3.359e-02, 1.173e-01, 9.277e-02, 1.577e-01, -8.326e-02, 1.685e-02), r0);
	r1 = MulAdd(s0_0_1, M4(1.646e-01, 2.901e-01, -3.359e-01, -6.320e-02, -4.730e-02, -3.252e-01, -2.061e-01, -1.401e-01, 4.871e-02, 2.347e-02, 1.797e-01, 7.995e-02, -1.470e-01, -4.847e-02, 4.809e-01, 5.727e-02), r1);
	r2 = MulAdd(s0_0_1, M4(-6.238e-02, -1.443e-01, 1.845e-01, -5.890e-02, -7.351e-02, -1.213e-01, -1.165e-01, 4.939e-02, -3.111e-03, -1.233e-01, 1.715e-01, 5.261e-02, 1.076e-01, 5.183e-02, 6.130e-03, -5.612e-02), r2);
	r0 = MulAdd(s0_0_2, M4(4.993e-02, -6.425e-02, -9.127e-04, 1.079e-01, 4.481e-03, -1.210e-01, 1.247e-02, -2.445e-01, 1.774e-02, -2.764e-01, 5.006e-03, -1.142e-01, -2.644e-02, 2.134e-01, 1.738e-02, -8.312e-02), r0);
	r1 = MulAdd(s0_0_2, M4(9.025e-02, -2.346e-02, 9.599e-02, 2.617e-01, -9.498e-02, 4.933e-02, 4.825e-02, -1.160e-01, -2.593e-03, -4.598e-02, -1.515e-03, -4.995e-02, 5.967e-02, -1.864e-02, 1.350e-01, -6.435e-02), r1);
	r2 = MulAdd(s0_0_2, M4(6.188e-02, -8.339e-02, 7.458e-02, -1.726e-01, -6.567e-02, -3.636e-02, 1.179e-01, -4.469e-02, -1.245e-02, 4.836e-03, 1.244e-02, -5.965e-02, -7.728e-04, -8.893e-02, -1.136e-01, -7.585e-02), r2);
	r0 = MulAdd(s0_1_0, M4(-1.431e-01, -8.856e-01, -1.800e-02, -3.134e-02, -7.293e-02, -2.156e-01, 1.660e-03, 2.090e-02, 2.647e-01, 7.340e-03, 1.372e-01, 1.324e-01, 4.186e-02, 8.781e-02, 2.679e-02, 6.710e-02), r0);
	r1 = MulAdd(s0_1_0, M4(-2.023e-02, -5.685e-01, -2.805e-01, 1.028e-01, 3.799e-02, 3.429e-02, 3.215e-02, -3.480e-02, 4.155e-01, 4.517e-01, 3.052e-01, 4.466e-01, -8.590e-02, -5.328e-02, 1.420e-01, -1.434e-01), r1);
	r2 = MulAdd(s0_1_0, M4(5.193e-02, 2.826e-02, 1.665e-01, -2.806e-02, 5.149e-02, 9.260e-02, -1.110e-01, 1.163e-01, -8.487e-02, -7.128e-02, 2.979e-01, -1.580e-01, 4.798e-02, 2.079e-02, -7.733e-02, 3.354e-02), r2);
	r0 = MulAdd(s0_1_1, M4(1.047e-01, -5.962e-01, 5.820e-02, -7.738e-02, 5.410e-01, 3.181e-02, -1.573e-02, 7.027e-02, 5.748e-02, -1.923e-01, 7.301e-02, 1.479e-01, 1.306e-01, -4.845e-02, 2.518e-01, -3.623e-01), r0);
	r1 = MulAdd(s0_1_1, M4(6.321e-02, 3.761e-02, 2.010e-01, -1.364e-01, 3.390e-01, -1.184e-01, -3.796e-01, 8.079e-01, -6.100e-02, -3.467e-01, 2.407e-01, 6.632e-01, 1.479e-01, -1.663e-01, -3.796e-01, 6.408e-02), r1);
	r2 = MulAdd(s0_1_1, M4(1.922e-01, -1.099e-01, -1.372e-01, -2.865e-01, -2.607e-01, 2.608e-01, -3.294e-01, -4.561e-01, 2.533e-01, -1.393e-01, -1.968e-02, -2.260e-01, -9.535e-02, -2.904e-01, 4.539e-02, 1.962e-01), r2);
	r0 = MulAdd(s0_1_2, M4(4.202e-02, -3.441e-01, -5.041e-03, 6.145e-02, -3.193e-01, -8.410e-02, 1.436e-02, -8.656e-01, -7.106e-02, -2.674e-01, 5.246e-03, -1.115e-01, 2.656e-02, 7.308e-02, -3.231e-02, 3.213e-01), r0);
	r1 = MulAdd(s0_1_2, M4(8.666e-02, -1.030e-02, 1.127e-01, -1.118e-01, 5.768e-03, -9.184e-02, -3.923e-01, -1.075e-01, 5.027e-02, -3.824e-02, -9.206e-02, -2.105e-03, 1.086e-02, -1.059e-01, 1.138e-01, -3.379e-02), r1);
	r2 = MulAdd(s0_1_2, M4(-1.336e-02, -9.546e-02, 5.256e-02, -2.549e-01, 3.018e-01, 4.872e-01, 2.351e-01, -1.239e-01, 5.555e-02, 1.087e-01, 3.377e-02, -1.048e-01, 1.470e-02, 1.461e-01, 3.523e-02, 7.192e-04), r2);
	r0 = MulAdd(s0_2_0, M4(-5.178e-03, -1.025e-01, -2.115e-02, 6.963e-03, 5.296e-02, 2.358e-01, 8.896e-03, 2.767e-02, 2.588e-04, -3.482e-01, 5.441e-03, 6.959e-02, 3.456e-02, -6.226e-02, 2.074e-02, -2.594e-02), r0);
	r1 = MulAdd(s0_2_0, M4(7.487e-02, 1.926e-01, 1.372e-01, 1.103e-01, 8.974e-02, 1.338e-03, 3.621e-01, 1.107e-01, 1.989e-01, 4.757e-01, 1.753e-01, 2.757e-01, 2.485e-02, 3.196e-02, -1.901e-02, -1.550e-02), r1);
	r2 = MulAdd(s0_2_0, M4(-2.344e-02, -7.446e-02, 1.229e-01, 9.867e-03, -4.147e-02, 5.022e-03, 3.727e-02, -1.014e-02, -4.265e-02, -2.538e-02, 1.716e-01, -4.235e-02, -4.406e-02, -2.428e-03, 2.516e-02, 3.334e-02), r2);
	r0 = MulAdd(s0_2_1, M4(6.321e-02, -9.709e-02, -3.168e-02, 3.881e-02, 7.005e-02, -1.165e-01, 3.065e-01, 1.001e-01, -3.002e-02, -4.404e-01, 4.678e-02, 1.850e-01, -2.144e-02, 6.650e-02, -6.158e-02, -7.892e-02), r0);
	r1 = MulAdd(s0_2_1, M4(1.411e-01, 3.582e-02, 5.495e-02, 2.354e-01, -2.686e-01, -7.208e-01, 2.408e-01, -4.971e-01, 9.059e-02, -3.666e-02, 2.021e-01, 3.297e-01, -8.278e-02, -1.104e-01, 8.389e-03, -1.223e-01), r1);
	r2 = MulAdd(s0_2_1, M4(-4.799e-02, -1.045e-01, 5.626e-02, 7.385e-03, 1.380e-01, 1.744e-01, -1.752e-01, 1.615e-01, -1.701e-02, -2.029e-02, 1.421e-01, -9.601e-02, 6.468e-02, 2.879e-02, -4.902e-02, -8.367e-02), r2);
	r0 = MulAdd(s0_2_2, M4(-2.225e-02, -1.595e-01, -4.593e-02, 4.723e-02, 8.864e-02, -1.375e-01, 5.149e-02, -8.715e-03, -3.479e-02, -6.187e-02, 8.630e-03, 9.938e-02, 1.695e-02, -6.475e-02, 1.606e-03, 4.977e-03), r0);
	r1 = MulAdd(s0_2_2, M4(2.498e-02, 6.952e-02, 2.386e-02, 1.137e-01, -1.556e-02, -2.305e-03, 1.508e-01, -1.112e-01, 1.222e-01, 5.966e-02, 7.608e-03, 6.850e-03, 2.194e-02, 7.293e-02, 4.140e-02, -1.049e-01), r1);
	r2 = MulAdd(s0_2_2, M4(-4.498e-02, 4.627e-02, 2.575e-02, -8.819e-02, 2.506e-02, -2.998e-01, 2.183e-01, 3.620e-02, 3.638e-03, 4.575e-02, 1.408e-01, -5.385e-02, -6.157e-03, -1.606e-02, -5.590e-02, 4.172e-03), r2);
	// Clamp negative sums to zero (ReLU) and store each accumulator in its
	// output texture at this thread's pixel.
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
	r2 = max(r2, 0.0);
	T2[gxy] = r2;
}

//!PASS 4
//!DESC conv3 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T0, T1, T2
//!OUT T3, T4, T5

// Tap readers for this pass: each samples one of the three input textures
// (T0, T1, T2) through the O() macro at an offset of (x, y) from the current
// position and converts the result to V4.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))
#define L2(x, y) V4(O(T2, x, y))

void Pass4(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	float2 pos = (gxy + 0.5) * pt;
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(1.107e-02, -7.586e-03, 1.055e-01, -1.436e-02, -1.937e-02, 1.235e-02, -1.291e-02, -4.964e-03, -4.749e-02, 8.916e-02, -5.278e-02, -4.107e-02, -1.112e-01, 5.605e-02, -1.426e-01, 5.483e-02), r0);
	r1 = MulAdd(s0_0_0, M4(-1.175e-02, -6.420e-02, -1.851e-02, 2.728e-02, -3.537e-02, 1.209e-02, -3.933e-03, -5.783e-02, -6.833e-02, 6.540e-02, 4.688e-02, -2.901e-02, 1.201e-02, -1.721e-02, 6.334e-02, 8.846e-03), r1);
	r2 = MulAdd(s0_0_0, M4(-9.284e-02, -3.249e-02, -2.467e-02, 2.212e-02, -1.087e-02, -3.703e-02, 1.593e-03, -3.284e-03, 1.538e-01, -1.921e-02, 1.327e-02, -9.015e-02, -1.552e-03, 6.812e-02, 9.520e-02, -1.498e-02), r2);
	r0 = MulAdd(s0_0_1, M4(1.339e-01, 3.130e-02, 2.163e-01, -5.306e-03, -1.323e-02, -1.457e-01, 1.654e-02, -1.088e-01, -5.421e-02, -1.098e-02, -1.141e-02, 2.206e-02, 8.031e-02, -1.577e-01, 1.933e-02, 3.560e-02), r0);
	r1 = MulAdd(s0_0_1, M4(4.372e-02, -9.601e-02, -1.303e-02, 3.410e-02, 5.823e-02, -3.635e-03, -2.811e-02, -1.210e-02, 3.595e-02, 5.528e-01, 2.332e-01, -7.283e-02, 1.353e-01, 1.489e-01, -7.819e-02, 3.119e-02), r1);
	r2 = MulAdd(s0_0_1, M4(-1.645e-01, -1.023e-01, -7.464e-02, 7.838e-02, 1.174e-02, -1.126e-01, -7.492e-02, -1.587e-01, 6.317e-01, -8.793e-03, -1.082e-02, 1.250e-01, 6.958e-02, 7.400e-02, 2.525e-02, 1.143e-01), r2);
	r0 = MulAdd(s0_0_2, M4(8.408e-03, 2.266e-02, -6.888e-03, 8.242e-03, -7.029e-02, -2.028e-01, 2.035e-01, -4.202e-03, 3.821e-03, 1.425e-01, -2.011e-02, -7.842e-02, -2.531e-02, -7.025e-02, 7.411e-02, -1.875e-02), r0);
	r1 = MulAdd(s0_0_2, M4(8.579e-03, 6.018e-02, -3.286e-02, -1.822e-02, -8.032e-02, -1.984e-02, 2.641e-02, 2.281e-02, -6.372e-02, -2.290e-02, -8.482e-03, 2.069e-02, 3.086e-02, 2.962e-02, -4.787e-03, 3.542e-03), r1);
	r2 = MulAdd(s0_0_2, M4(-8.781e-03, 1.992e-02, -2.689e-02, 1.677e-03, 7.394e-02, -6.466e-02, 3.778e-02, 1.010e-01, 2.537e-02, -9.397e-02, 7.088e-02, -1.665e-01, 4.651e-03, 1.024e-02, 2.075e-02, -1.922e-02), r2);
	r0 = MulAdd(s0_1_0, M4(-2.070e-02, -1.197e-01, -1.990e-01, 1.231e-02, -6.372e-02, -6.034e-02, 2.702e-02, -1.255e-02, 6.392e-03, -9.508e-02, -6.023e-02, -4.258e-02, 1.364e-01, -5.254e-01, -4.302e-03, -2.192e-02), r0);
	r1 = MulAdd(s0_1_0, M4(-3.606e-03, -5.297e-02, 6.144e-02, -1.684e-01, -8.938e-03, 5.772e-03, -1.082e-01, -6.488e-02, -1.899e-02, 5.163e-02, -1.401e-01, 1.037e-02, 1.125e-01, 1.096e-01, -1.148e-01, 9.304e-02), r1);
	r2 = MulAdd(s0_1_0, M4(-7.149e-02, -9.924e-03, 1.299e-01, 8.492e-03, 2.227e-02, -1.012e-01, -1.973e-02, -5.824e-03, -9.158e-02, -1.057e-01, -1.659e-01, -5.627e-02, -2.326e-02, -9.287e-02, -1.670e-01, 1.088e-01), r2);
	r0 = MulAdd(s0_1_1, M4(-1.518e-01, -2.740e-01, -6.935e-01, -1.679e-01, -1.432e-01, 2.338e-01, -8.023e-02, -1.140e-01, -1.375e-01, 2.449e-02, -1.096e-01, -2.695e-03, -2.931e-02, -1.313e-01, -2.825e-01, 7.026e-01), r0);
	r1 = MulAdd(s0_1_1, M4(-1.160e-01, -1.034e-02, -1.199e-01, 2.998e-01, 1.962e-02, -7.827e-02, 1.704e-02, 5.374e-02, -2.662e-01, -5.043e-02, 1.392e-01, -2.031e-01, 6.656e-01, 3.125e-01, -5.106e-02, 5.728e-03), r1);
	r2 = MulAdd(s0_1_1, M4(-1.480e-01, -1.646e-01, 3.587e-01, -2.959e-01, -1.096e-01, 2.173e-01, 1.086e-01, 2.301e-02, -2.055e-01, -1.023e-01, -3.057e-01, -3.379e-02, -3.048e-02, 9.778e-01, 4.615e-03, 6.197e-01), r2);
	r0 = MulAdd(s0_1_2, M4(2.588e-01, -3.288e-01, 9.400e-02, 7.456e-02, 2.543e-02, 4.895e-02, -2.193e-01, 6.066e-02, -1.181e-01, 2.286e-01, 6.175e-02, -7.397e-02, -7.838e-03, 1.137e-03, -1.181e-01, -6.617e-02), r0);
	r1 = MulAdd(s0_1_2, M4(1.842e-02, 8.958e-02, 1.316e-02, -5.750e-02, -1.043e-01, -3.054e-03, -9.471e-03, -6.891e-02, 2.555e-02, -3.034e-02, -1.053e-02, 1.855e-02, -2.007e-02, -5.271e-02, 6.118e-03, 1.266e-02), r1);
	r2 = MulAdd(s0_1_2, M4(-1.210e-02, -1.167e-01, 1.382e-02, 1.627e-01, -1.266e-01, -2.627e-01, 2.117e-02, -1.223e-01, -9.070e-02, 1.132e-01, -6.055e-02, -1.080e-01, -2.421e-02, 7.482e-04, 1.048e-02, -5.972e-02), r2);
	r0 = MulAdd(s0_2_0, M4(3.911e-03, 1.750e-02, 3.922e-02, -3.796e-03, -5.522e-02, 1.230e-02, -8.789e-02, 5.378e-03, 3.127e-02, 3.381e-02, 7.570e-03, 1.247e-02, -5.024e-02, 3.352e-02, -2.474e-02, 7.495e-02), r0);
	r1 = MulAdd(s0_2_0, M4(8.937e-03, 4.326e-03, 1.981e-02, 1.906e-02, -9.892e-03, 8.998e-03, -3.625e-02, -2.925e-03, 4.852e-03, 2.366e-02, 2.068e-02, -4.590e-03, 7.019e-04, 1.655e-02, -9.594e-03, -2.051e-02), r1);
	r2 = MulAdd(s0_2_0, M4(1.946e-02, -1.403e-02, -1.421e-01, -2.727e-03, 1.774e-02, -1.743e-02, 6.418e-04, -2.003e-02, -3.121e-03, -3.113e-02, -8.674e-02, -3.256e-03, -9.932e-03, 8.046e-02, -1.812e-01, -1.974e-02), r2);
	r0 = MulAdd(s0_2_1, M4(3.081e-02, 5.413e-03, 1.586e-02, -1.679e-02, -9.400e-02, -6.284e-02, 6.014e-03, -9.848e-03, 3.195e-03, -1.555e-01, 2.866e-02, 2.664e-03, 5.300e-02, 4.296e-02, -2.273e-02, 4.507e-02), r0);
	r1 = MulAdd(s0_2_1, M4(2.702e-02, 3.059e-02, 3.240e-03, 5.093e-03, -6.727e-03, -2.836e-02, -3.437e-02, -7.645e-03, 4.881e-03, 3.384e-03, -3.083e-04, -4.348e-02, 5.041e-02, 5.575e-03, -2.038e-02, 3.181e-02), r1);
	r2 = MulAdd(s0_2_1, M4(3.636e-02, 2.582e-02, -4.149e-01, 2.337e-02, 3.897e-02, -1.143e-01, 1.140e-02, -1.659e-03, 3.214e-04, -2.628e-03, 1.069e-01, 1.670e-02, 7.095e-03, 1.710e-01, -2.429e-01, 5.219e-03), r2);
	r0 = MulAdd(s0_2_2, M4(2.787e-02, -1.218e-01, -2.105e-01, 3.781e-02, -3.321e-02, -1.218e-01, -8.400e-02, -4.572e-02, -2.062e-02, -7.611e-03, 2.473e-02, -4.654e-02, 3.235e-02, 2.067e-03, 1.666e-02, 4.527e-02), r0);
	r1 = MulAdd(s0_2_2, M4(-4.722e-02, -2.555e-03, 1.898e-02, -7.150e-03, -6.128e-03, 1.068e-02, -1.393e-02, 1.358e-02, 1.644e-02, 1.067e-02, -2.074e-02, -4.161e-03, 1.391e-02, 2.161e-03, -3.010e-03, -6.476e-05), r1);
	r2 = MulAdd(s0_2_2, M4(-4.064e-02, -3.059e-02, -1.303e-01, -3.093e-02, -5.691e-03, -4.066e-03, -7.393e-02, 7.505e-03, -1.580e-02, 1.775e-03, -6.956e-02, -1.591e-02, 1.212e-02, 6.518e-02, -5.630e-02, 2.264e-02), r2);
	r0 = MulAdd(s1_0_0, M4(3.086e-02, 6.044e-02, -6.380e-02, 3.153e-02, -1.003e-02, -8.727e-03, 4.148e-03, 4.175e-03, 2.275e-02, -2.464e-02, 1.069e-01, -7.403e-03, 3.923e-04, -9.897e-02, -3.999e-02, -1.076e-02), r0);
	r1 = MulAdd(s1_0_0, M4(5.753e-02, -9.432e-03, 5.047e-02, 9.986e-02, -7.449e-03, -8.126e-03, -2.703e-02, -7.128e-02, -3.009e-02, -1.906e-02, -3.597e-03, -2.820e-02, 6.181e-02, -3.942e-02, 2.125e-02, 7.000e-02), r1);
	r2 = MulAdd(s1_0_0, M4(7.374e-02, 4.773e-02, -2.306e-02, 1.252e-02, -5.479e-02, -1.884e-02, 1.691e-02, 1.709e-02, 8.239e-02, 2.584e-02, -1.340e-02, -4.977e-02, -3.922e-02, 9.837e-03, -1.908e-02, -1.885e-02), r2);
	r0 = MulAdd(s1_0_1, M4(5.381e-02, 1.995e-01, -4.267e-01, 9.627e-02, -5.954e-02, -1.321e-01, 2.922e-02, -1.440e-01, 4.046e-02, -1.489e-01, 3.512e-02, -4.719e-03, -9.003e-04, -4.500e-02, -8.684e-02, -3.749e-02), r0);
	r1 = MulAdd(s1_0_1, M4(-1.070e-01, 4.641e-02, 4.013e-03, -8.420e-02, -8.225e-02, -8.093e-02, -8.456e-02, -2.291e-01, 4.060e-02, -5.871e-02, -3.366e-02, 1.083e-02, -1.111e-01, 2.277e-02, -3.886e-04, -6.764e-02), r1);
	r2 = MulAdd(s1_0_1, M4(7.558e-02, 1.077e-01, 4.945e-02, 1.812e-01, -1.577e-01, -1.743e-01, 2.314e-02, -2.517e-01, -9.714e-02, -1.282e-02, -2.545e-02, 3.826e-02, -1.095e-01, -1.265e-01, 2.556e-02, 5.869e-02), r2);
	r0 = MulAdd(s1_0_2, M4(9.103e-02, 8.952e-02, 3.866e-02, 7.600e-02, -1.081e-01, -1.946e-01, -4.214e-03, -1.597e-01, -8.372e-02, 6.776e-03, 1.246e-01, 1.044e-02, 5.902e-03, 6.204e-02, -6.617e-02, -1.837e-02), r0);
	r1 = MulAdd(s1_0_2, M4(4.165e-02, -1.604e-02, -4.236e-02, 6.721e-02, -1.311e-01, -4.814e-02, 4.841e-06, -1.708e-02, -2.865e-02, 1.424e-02, 1.347e-02, -7.547e-03, -1.101e-02, -2.264e-02, -2.020e-02, 1.034e-02), r1);
	r2 = MulAdd(s1_0_2, M4(6.566e-02, 5.067e-02, 3.788e-02, -1.026e-01, -9.318e-02, -1.250e-01, 1.854e-02, -3.100e-01, 4.224e-02, -3.578e-02, 3.523e-03, -1.348e-02, -3.052e-02, 1.595e-03, 1.156e-02, -1.072e-01), r2);
	r0 = MulAdd(s1_1_0, M4(-4.826e-03, 1.563e-02, 1.743e-01, -3.129e-02, -8.834e-03, -6.834e-02, -1.232e-01, 9.740e-04, -2.307e-02, 1.343e-01, -1.614e-02, -3.587e-02, 2.912e-02, 7.655e-02, 1.023e-01, -2.615e-02), r0);
	r1 = MulAdd(s1_1_0, M4(-5.634e-02, 1.706e-02, 8.134e-02, 1.256e-01, 5.752e-03, -6.141e-03, 2.510e-03, -4.749e-02, -3.770e-02, -4.273e-03, -2.957e-02, -1.401e-01, -2.186e-02, -3.266e-02, 1.800e-02, 4.675e-02), r1);
	r2 = MulAdd(s1_1_0, M4(5.656e-02, 2.103e-02, -1.542e-02, -1.761e-02, 4.098e-02, 4.337e-02, -2.259e-02, -3.915e-03, -9.476e-02, -5.481e-02, 4.218e-02, 3.457e-02, 9.828e-03, 7.507e-02, 2.726e-04, -5.194e-02), r2);
	r0 = MulAdd(s1_1_1, M4(4.795e-01, 6.816e-02, 4.811e-01, 4.776e-01, -1.241e-01, -4.053e-01, -4.209e-01, -2.466e-01, -2.505e-01, 4.076e-01, -7.223e-02, -1.150e-01, 1.489e-01, -4.823e-02, 3.995e-01, 8.227e-02), r0);
	r1 = MulAdd(s1_1_1, M4(3.047e-01, 1.218e-01, 2.891e-02, -5.873e-03, -1.634e-01, -7.559e-03, -7.886e-02, -1.715e-01, -5.829e-02, -3.403e-02, 9.464e-02, 2.227e-02, 2.013e-03, -2.198e-02, -6.740e-02, -1.181e-02), r1);
	r2 = MulAdd(s1_1_1, M4(3.447e-01, 1.530e-01, -2.682e-01, 2.229e-01, -7.700e-02, -2.464e-01, -8.120e-02, -7.553e-02, -1.184e-01, 1.224e-01, 1.276e-01, -2.222e-01, 3.412e-02, -1.485e-01, -2.091e-02, 3.864e-02), r2);
	r0 = MulAdd(s1_1_2, M4(-1.460e-02, -7.522e-02, 2.366e-01, -2.705e-01, -1.382e-01, -3.372e-01, -2.803e-01, -4.135e-01, 4.720e-02, 1.969e-01, -1.996e-02, 9.691e-04, -5.822e-03, -1.110e-01, 1.056e-01, -1.411e-01), r0);
	r1 = MulAdd(s1_1_2, M4(-2.918e-02, -1.675e-01, -2.338e-02, 1.073e-02, -1.116e-01, -1.075e-01, -4.731e-05, -3.597e-02, -3.186e-02, 1.848e-02, 2.892e-02, -1.834e-02, -6.714e-02, -9.574e-03, -2.808e-02, -1.129e-02), r1);
	r2 = MulAdd(s1_1_2, M4(6.185e-02, 1.610e-01, -4.552e-02, -6.520e-02, 3.802e-03, -3.092e-01, -1.752e-01, -2.075e-01, 8.186e-03, -9.269e-02, 7.484e-03, 4.464e-02, 1.965e-02, -1.146e-01, -4.780e-02, -3.162e-02), r2);
	r0 = MulAdd(s1_2_0, M4(-1.464e-03, 2.867e-02, 2.153e-02, 2.619e-02, 8.161e-03, 5.917e-02, 7.142e-02, -4.458e-03, -1.235e-02, -1.235e-01, -3.510e-02, -4.025e-03, 1.425e-02, -2.119e-02, -4.404e-02, -3.654e-02), r0);
	r1 = MulAdd(s1_2_0, M4(-3.063e-03, 3.779e-03, -3.588e-02, -9.254e-03, 4.911e-03, 4.424e-04, 1.341e-02, -2.043e-02, -2.959e-03, -1.245e-02, 3.236e-02, -2.628e-02, -2.706e-02, 2.034e-03, -1.953e-03, -3.398e-02), r1);
	r2 = MulAdd(s1_2_0, M4(1.393e-02, 1.281e-02, 1.345e-01, 1.494e-02, -7.099e-03, -3.379e-03, 1.050e-04, -5.785e-03, -5.815e-04, -3.439e-02, -1.386e-02, 1.174e-03, -2.508e-02, -6.662e-02, 2.586e-02, -1.334e-02), r2);
	r0 = MulAdd(s1_2_1, M4(-2.422e-03, 1.762e-03, 1.537e-01, 4.732e-03, -4.987e-05, -7.367e-04, 1.012e-01, -2.300e-02, -1.262e-02, -5.584e-02, -3.598e-02, -1.109e-01, 1.838e-02, 4.078e-02, -4.883e-02, 2.328e-02), r0);
	r1 = MulAdd(s1_2_1, M4(2.368e-02, -4.773e-02, 5.405e-02, 1.563e-02, 9.718e-03, 1.256e-02, -2.327e-02, -2.678e-02, 1.803e-02, -4.577e-02, -1.029e-02, -2.792e-02, -4.075e-02, 4.649e-02, 7.044e-03, 1.674e-02), r1);
	r2 = MulAdd(s1_2_1, M4(-3.104e-03, 7.276e-02, 1.396e-01, 5.411e-02, 1.272e-02, 4.861e-02, 4.675e-02, -2.675e-02, -3.754e-02, -2.884e-02, -1.201e-01, -6.665e-02, -5.889e-02, 7.083e-02, 8.424e-02, -5.200e-02), r2);
	r0 = MulAdd(s1_2_2, M4(-2.949e-02, 1.582e-01, -6.209e-02, -2.691e-03, 3.133e-02, 2.802e-02, 9.327e-02, -1.352e-02, -1.829e-02, -1.135e-01, -5.252e-02, 1.193e-02, 1.413e-02, 1.626e-01, -1.606e-01, -9.994e-03), r0);
	r1 = MulAdd(s1_2_2, M4(5.357e-02, -3.828e-02, 2.018e-02, 6.172e-03, -4.574e-03, 3.028e-03, 6.879e-03, -1.936e-02, -3.342e-02, 3.382e-02, -1.554e-02, -8.174e-04, -4.675e-02, 2.051e-02, 8.388e-03, -1.765e-02), r1);
	r2 = MulAdd(s1_2_2, M4(-5.915e-03, 1.033e-01, 7.604e-02, 8.160e-03, 2.664e-02, -2.366e-02, 1.364e-02, -1.858e-02, -1.601e-02, -5.542e-02, -7.658e-02, 7.924e-03, 6.480e-03, -1.109e-01, 4.260e-02, -2.556e-02), r2);
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(4.575e-02, -1.303e-02, 1.647e-02, 4.558e-02, 7.628e-03, -5.689e-02, 8.225e-04, 3.511e-02, -1.249e-02, -1.358e-01, -9.305e-02, 2.464e-02, -5.438e-03, 7.792e-02, -1.668e-02, 3.338e-03), r0);
	r1 = MulAdd(s0_0_0, M4(-8.441e-03, -5.848e-02, 3.524e-02, 3.909e-02, -1.872e-02, -4.137e-02, -8.175e-02, -2.178e-02, -1.308e-02, 6.602e-02, -9.989e-02, -9.377e-03, -1.340e-02, 1.172e-03, -2.914e-03, -9.421e-03), r1);
	r2 = MulAdd(s0_0_0, M4(-2.569e-02, 6.342e-02, -2.181e-02, 8.972e-03, -9.789e-02, 3.871e-02, -1.178e-03, 2.976e-02, -1.198e-01, 8.030e-04, 4.747e-02, 6.748e-02, 7.651e-02, -1.169e-02, 3.881e-02, 8.129e-02), r2);
	r0 = MulAdd(s0_0_1, M4(1.646e-01, -5.599e-02, 1.693e-02, -2.750e-02, 4.108e-02, -1.110e-01, 5.907e-02, 4.678e-03, -1.023e-01, -1.698e-01, -3.326e-01, -1.734e-02, -6.010e-02, 1.997e-01, -1.813e-01, -1.152e-02), r0);
	r1 = MulAdd(s0_0_1, M4(3.248e-02, -1.509e-01, -6.375e-02, 3.075e-02, 8.716e-02, 4.064e-02, -2.780e-02, 1.797e-02, -1.597e-01, -1.439e-01, -3.406e-02, -7.890e-02, -1.409e-02, 1.361e-01, -3.307e-02, -9.093e-02), r1);
	r2 = MulAdd(s0_0_1, M4(-2.140e-01, 6.902e-02, -6.323e-02, 2.530e-02, 2.653e-02, -3.867e-02, -1.353e-02, 7.301e-02, -1.531e-01, -7.432e-02, -6.258e-03, -2.466e-01, 1.024e-01, -8.151e-03, -2.193e-02, -1.013e-01), r2);
	r0 = MulAdd(s0_0_2, M4(5.234e-02, -6.414e-02, -2.460e-02, 6.617e-02, -3.850e-02, -3.153e-02, 6.890e-02, -5.428e-03, -3.260e-02, -6.421e-02, -2.691e-02, 1.642e-02, -2.241e-02, 8.234e-02, -9.737e-02, -9.072e-03), r0);
	r1 = MulAdd(s0_0_2, M4(4.433e-02, -9.558e-04, 7.463e-03, -1.860e-02, -4.637e-04, 4.377e-03, -5.919e-02, 1.210e-02, -3.575e-02, 4.992e-03, -9.182e-04, 1.632e-04, 1.850e-02, 1.971e-02, -1.203e-02, 3.781e-03), r1);
	r2 = MulAdd(s0_0_2, M4(4.936e-03, 7.935e-02, -5.899e-02, 1.034e-01, -3.919e-02, 7.293e-03, 4.211e-02, -6.380e-02, 3.799e-03, -4.941e-03, -2.778e-02, -3.050e-02, -5.322e-03, -8.648e-03, -3.967e-02, -1.065e-02), r2);
	r0 = MulAdd(s0_1_0, M4(-3.355e-02, -9.236e-02, -2.431e-02, -6.998e-03, 2.729e-02, 2.017e-01, -6.535e-02, -6.018e-02, 1.071e-01, -5.317e-03, -5.115e-02, 9.426e-02, 1.571e-01, -5.227e-02, 1.899e-01, 1.483e-01), r0);
	r1 = MulAdd(s0_1_0, M4(2.010e-02, 2.715e-02, 5.580e-02, 5.982e-02, 1.101e-01, 2.366e-02, 1.834e-01, 4.523e-01, 4.161e-02, 2.628e-02, -1.582e-01, 1.283e-01, -3.506e-02, 6.232e-02, -1.724e-01, -5.436e-02), r1);
	r2 = MulAdd(s0_1_0, M4(-1.206e-01, 3.211e-03, -3.895e-02, 1.111e-01, 1.068e-01, 1.286e-01, 1.677e-02, -5.113e-02, -8.345e-03, -3.459e-02, -1.639e-01, 5.873e-02, 2.421e-01, -2.482e-01, -1.268e-01, 5.117e-02), r2);
	r0 = MulAdd(s0_1_1, M4(-1.899e-01, 2.738e-01, -2.686e-01, -3.918e-02, 1.457e-01, 6.777e-01, -1.027e-01, 6.352e-01, -4.135e-01, -4.998e-01, -4.013e-01, -6.113e-01, -6.604e-02, -2.191e-01, 6.314e-01, -3.508e-01), r0);
	r1 = MulAdd(s0_1_1, M4(1.350e-01, -1.856e-01, 4.200e-01, 1.581e-01, 6.316e-03, 3.760e-01, 4.197e-03, 1.185e-02, -3.487e-01, -2.013e-01, -9.255e-02, -5.802e-02, 3.708e-01, -7.836e-02, -2.751e-02, 1.673e-01), r1);
	r2 = MulAdd(s0_1_1, M4(-1.733e-01, 3.196e-01, 5.139e-02, -4.092e-01, -2.125e-01, 4.265e-01, 7.144e-02, 5.443e-01, -2.156e-01, -7.291e-01, -4.405e-01, -2.054e-01, 8.668e-02, 1.603e-01, 4.263e-02, 1.288e-01), r2);
	r0 = MulAdd(s0_1_2, M4(1.198e-01, -2.055e-02, -9.637e-02, 1.353e-01, -3.969e-02, 9.572e-02, -1.411e-01, -4.163e-02, 2.984e-03, -2.090e-01, -1.206e-01, -1.126e-01, 3.327e-02, 7.082e-02, 1.840e-01, 6.029e-02), r0);
	r1 = MulAdd(s0_1_2, M4(2.336e-02, 4.718e-03, -4.423e-03, -7.106e-02, 8.021e-02, 3.360e-03, -5.629e-02, 3.820e-02, -7.984e-02, 1.774e-02, 2.236e-02, -2.164e-02, 1.110e-02, 8.130e-02, 2.679e-02, -1.682e-02), r1);
	r2 = MulAdd(s0_1_2, M4(-2.772e-02, -3.586e-02, -5.038e-02, 2.683e-01, 1.165e-02, 6.472e-02, -2.486e-02, -1.365e-02, -8.500e-03, -1.515e-01, 2.658e-02, -1.116e-01, 1.635e-02, -3.678e-02, 1.738e-02, 8.708e-02), r2);
	r0 = MulAdd(s0_2_0, M4(-3.010e-02, -3.432e-02, 6.656e-03, 2.443e-02, -1.277e-02, -2.407e-01, 9.988e-02, -1.392e-02, -4.173e-03, -2.949e-02, -1.002e-01, -4.650e-03, -3.556e-03, 2.154e-01, -5.911e-02, 7.693e-03), r0);
	r1 = MulAdd(s0_2_0, M4(2.181e-02, -4.800e-02, 3.820e-02, 7.920e-03, -8.800e-03, -6.165e-03, -1.032e-01, 1.155e-01, -2.153e-02, -2.463e-02, 1.150e-02, 6.907e-02, -1.439e-02, -4.144e-03, -2.407e-02, -5.085e-02), r1);
	r2 = MulAdd(s0_2_0, M4(3.500e-02, -3.716e-02, 1.244e-01, -1.066e-02, -7.304e-02, 2.348e-02, -2.122e-02, -1.773e-02, -5.508e-03, -3.846e-02, -1.573e-02, -1.726e-02, -4.657e-02, -3.351e-02, 6.313e-02, 4.749e-02), r2);
	r0 = MulAdd(s0_2_1, M4(8.042e-02, -3.736e-02, 2.085e-01, -7.492e-02, 1.772e-01, -4.333e-01, 2.667e-01, 1.451e-01, 3.622e-02, 4.003e-02, -2.270e-03, 4.168e-02, -5.973e-02, 1.137e-01, -1.507e-01, 7.772e-03), r0);
	r1 = MulAdd(s0_2_1, M4(-5.197e-04, -9.347e-02, 3.764e-02, -2.294e-02, 1.107e-01, 1.018e-01, -7.931e-02, -4.964e-02, -2.368e-03, 3.294e-03, -1.165e-02, -1.509e-02, -4.021e-03, -1.438e-02, -6.825e-03, -7.611e-03), r1);
	r2 = MulAdd(s0_2_1, M4(6.421e-02, 1.641e-02, 3.321e-01, 9.427e-02, 1.332e-02, 2.332e-02, 5.218e-02, 1.769e-02, 1.407e-02, -4.560e-02, 6.187e-02, 3.204e-02, -8.659e-02, 5.799e-02, 4.971e-02, -3.703e-02), r2);
	r0 = MulAdd(s0_2_2, M4(1.112e-02, -1.842e-01, -4.613e-04, 6.763e-02, 1.931e-02, 2.643e-02, -3.419e-02, 1.110e-02, -3.941e-02, -2.481e-02, 9.748e-03, -1.104e-02, -7.950e-02, -1.714e-02, 1.864e-02, -7.010e-02), r0);
	r1 = MulAdd(s0_2_2, M4(5.141e-04, 3.225e-03, -2.421e-02, -5.770e-03, -3.196e-04, 3.591e-02, -7.955e-03, 1.768e-02, 2.527e-02, 5.602e-03, -3.657e-05, -3.184e-03, -1.418e-02, -1.411e-02, 3.785e-03, -2.365e-02), r1);
	r2 = MulAdd(s0_2_2, M4(-5.580e-03, 2.147e-02, -4.131e-04, -1.341e-02, -2.178e-02, 5.021e-02, -7.742e-02, 5.077e-03, 7.879e-03, -5.747e-03, -5.748e-03, 2.104e-02, -2.098e-02, -1.082e-01, 5.710e-02, -2.693e-02), r2);
	r0 = max(r0, 0.0);
	T3[gxy] = r0;
	r1 = max(r1, 0.0);
	T4[gxy] = r1;
	r2 = max(r2, 0.0);
	T5[gxy] = r2;
}

//!PASS 5
//!DESC out-shuffle (12x12)
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, T3, T4, T5
//!OUT OUTPUT

// Tap macros for this pass: L0/L1/L2 read T3/T4/T5 respectively through O(), which
// samples with the point sampler SP at `pos + float2(x, y) * pt`. Both `pos` and `pt`
// must therefore be in scope wherever these macros are expanded. The result is
// converted to V4 (MF4).
#define L0(x, y) V4(O(T3, x, y))
#define L1(x, y) V4(O(T4, x, y))
#define L2(x, y) V4(O(T5, x, y))

// Pass 5 (output stage): combines 3x3 neighbourhoods of the T3, T4 and T5 feature
// textures with constant weight matrices, then adds the result to a linearly
// filtered sample of INPUT and writes a 2x2 quad of OUTPUT pixels per thread.
//
// blockStart: added to the remapped thread coordinate to form the quad's origin in
//             OUTPUT pixel coordinates.
// tid:        only tid.x is read; it is passed through Rmp8x8 (defined outside this
//             section -- presumably maps a linear thread index to an 8x8 coordinate;
//             TODO confirm).
//
// Accumulator layout, as consumed by the final writes:
//   r0 / r1 / r2   -> red / green / blue offset added to the INPUT sample
//   .x .y .z .w    -> quad pixel at offset (0,0) / (1,0) / (0,1) / (1,1)
//
// MulAdd comes from the effect's `USE MulAdd` directive; it is presumably
// `mul(vector, matrix) + accumulator` -- confirm against its definition. The order of
// the MulAdd statements is kept as generated, since reordering could change rounding.
void Pass5(uint2 blockStart, uint3 tid) {
	// pt is the step used by the O() sampling macro to convert tap offsets.
	float2 pt = float2(GetInputPt());
	// Each thread owns a 2x2 output quad, hence the << 1 on the remapped coordinate.
	uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 sz = GetOutputSize();
	// Skip quads whose origin lies outside the output.
	// NOTE(review): the +1 writes at the end are not checked separately; this looks
	// safe only because OUTPUT is declared as INPUT * 2 in each dimension -- confirm.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// gxy >> 1 is the source texel under this quad; +0.5 targets its centre
	// (assuming pt is the reciprocal of the input size -- TODO confirm).
	float2 pos = ((gxy >> 1) + 0.5) * pt;
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// Load the 3x3 taps: s0_<row>_<col> from T3 (L0), s1_<row>_<col> from T4 (L1).
	// Row/col index 0..2 corresponds to a y/x offset of -1..1.
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Accumulate the T3 taps (three weight matrices per tap, one per accumulator).
	r0 = MulAdd(s0_0_0, M4(3.649e-02, 1.267e-02, 4.848e-03, 2.940e-03, -3.259e-02, 2.198e-04, -9.186e-05, 1.090e-03, 1.955e-02, 8.323e-03, 1.324e-03, 5.348e-04, 1.177e-02, -1.382e-02, 6.349e-03, 6.892e-03), r0);
	r1 = MulAdd(s0_0_0, M4(3.293e-02, 8.361e-03, 7.100e-03, 5.136e-03, -2.580e-02, 4.916e-04, 5.475e-03, 3.720e-03, 1.879e-02, 1.013e-02, -9.190e-04, 1.354e-03, 3.925e-03, -1.645e-02, 1.127e-03, 4.303e-03), r1);
	r2 = MulAdd(s0_0_0, M4(3.140e-02, 8.063e-03, 5.289e-03, 3.504e-03, -3.088e-02, -3.240e-03, 8.619e-04, 2.911e-03, 2.015e-02, 9.006e-03, 6.708e-04, 8.780e-04, 1.363e-02, -1.226e-02, 5.350e-03, 7.128e-03), r2);
	r0 = MulAdd(s0_0_1, M4(7.054e-02, 8.278e-02, -3.179e-03, 9.012e-03, -2.488e-02, -6.064e-02, -3.145e-03, -4.453e-03, 3.699e-02, 3.210e-02, -8.079e-03, -2.374e-03, 4.016e-02, -2.186e-02, 2.661e-03, 4.975e-04), r0);
	r1 = MulAdd(s0_0_1, M4(8.683e-02, 9.448e-02, -7.279e-04, 1.391e-02, -1.980e-02, -5.039e-02, -1.817e-03, -8.500e-04, 4.384e-02, 3.551e-02, -1.115e-02, -7.844e-03, 4.207e-02, -1.575e-02, 4.690e-03, 3.812e-03), r1);
	r2 = MulAdd(s0_0_1, M4(7.143e-02, 7.688e-02, -3.166e-03, 8.715e-03, -1.864e-02, -4.970e-02, 8.271e-04, -2.569e-03, 4.066e-02, 3.623e-02, -7.627e-03, -3.900e-03, 3.829e-02, -2.454e-02, 4.838e-04, 3.928e-03), r2);
	r0 = MulAdd(s0_0_2, M4(3.056e-03, 6.618e-03, 1.521e-03, 1.808e-03, -2.887e-03, 7.904e-03, 5.072e-03, 8.868e-03, 8.465e-03, 2.557e-02, 2.123e-03, -1.355e-03, -3.400e-03, -6.258e-03, -5.813e-03, 4.951e-03), r0);
	r1 = MulAdd(s0_0_2, M4(9.674e-04, 7.169e-03, -9.318e-05, 2.588e-04, -4.327e-03, 9.406e-03, 6.707e-03, 1.029e-02, 7.812e-03, 2.741e-02, 1.521e-03, -2.425e-03, -3.683e-03, -2.941e-03, -6.710e-03, 5.674e-03), r1);
	r2 = MulAdd(s0_0_2, M4(7.345e-04, 5.514e-03, 3.927e-04, 5.108e-04, -4.032e-03, 7.091e-03, 4.503e-03, 8.828e-03, 7.367e-03, 2.588e-02, 1.165e-03, -8.657e-04, -1.848e-03, -7.516e-04, -5.574e-03, 4.390e-03), r2);
	r0 = MulAdd(s0_1_0, M4(-1.274e-01, 1.575e-02, -9.426e-02, -8.019e-03, 1.084e-01, -4.380e-03, -1.091e-01, 1.305e-02, -8.959e-02, -7.260e-03, 9.644e-02, -1.216e-02, 7.315e-03, -4.327e-02, 6.519e-02, 2.265e-02), r0);
	r1 = MulAdd(s0_1_0, M4(-1.373e-01, 1.083e-02, -1.173e-01, -2.136e-02, 1.150e-01, -5.500e-03, -1.170e-01, 1.750e-02, -1.004e-01, -5.310e-03, 9.936e-02, -1.228e-02, -1.210e-03, -5.261e-02, 7.444e-02, 2.939e-02), r1);
	r2 = MulAdd(s0_1_0, M4(-1.111e-01, 1.698e-02, -9.345e-02, -1.209e-02, 1.052e-01, -4.066e-03, -1.105e-01, 1.081e-02, -9.102e-02, -5.239e-03, 9.448e-02, -7.886e-03, 6.231e-03, -4.456e-02, 6.664e-02, 2.457e-02), r2);
	r0 = MulAdd(s0_1_1, M4(4.576e-02, -2.902e-01, 2.036e-01, 3.925e-02, 2.376e-01, 3.427e-01, -2.111e-01, -3.359e-01, -2.842e-01, -2.855e-01, 2.663e-01, 3.365e-01, 6.861e-02, 7.349e-02, 4.268e-01, -5.964e-01), r0);
	r1 = MulAdd(s0_1_1, M4(4.870e-02, -3.149e-01, 2.388e-01, 4.847e-02, 2.666e-01, 3.804e-01, -2.485e-01, -3.839e-01, -3.213e-01, -3.339e-01, 3.018e-01, 3.721e-01, 8.028e-02, 9.813e-02, 5.032e-01, -6.973e-01), r1);
	r2 = MulAdd(s0_1_1, M4(3.371e-02, -2.725e-01, 1.978e-01, 3.460e-02, 2.346e-01, 3.311e-01, -2.176e-01, -3.389e-01, -2.862e-01, -2.959e-01, 2.607e-01, 3.235e-01, 6.955e-02, 7.280e-02, 4.209e-01, -5.959e-01), r2);
	r0 = MulAdd(s0_1_2, M4(8.611e-03, 1.086e-01, -2.495e-03, 3.319e-02, 7.649e-04, 4.040e-02, 7.752e-03, 4.978e-03, 1.688e-03, -9.363e-02, 2.655e-04, 4.584e-02, -1.888e-03, -1.677e-02, -1.173e-02, 8.529e-02), r0);
	r1 = MulAdd(s0_1_2, M4(8.481e-03, 1.184e-01, -6.932e-03, 3.609e-02, -1.231e-03, 3.974e-02, 1.086e-02, 4.193e-03, 6.263e-03, -9.410e-02, -2.489e-04, 4.999e-02, -4.640e-03, -1.863e-02, -1.589e-02, 1.014e-01), r1);
	r2 = MulAdd(s0_1_2, M4(8.684e-03, 1.038e-01, -4.633e-03, 3.199e-02, -7.231e-04, 4.093e-02, 9.584e-03, 3.364e-03, 3.721e-03, -9.101e-02, 1.936e-03, 4.898e-02, -3.727e-03, -2.277e-02, -1.341e-02, 9.319e-02), r2);
	r0 = MulAdd(s0_2_0, M4(2.063e-03, -9.065e-04, -1.640e-02, -7.556e-04, -8.636e-03, -3.449e-03, 1.913e-02, 6.389e-03, 9.046e-03, 5.324e-03, -2.351e-02, -4.512e-03, -3.410e-03, 6.242e-04, -1.773e-02, -5.725e-03), r0);
	r1 = MulAdd(s0_2_0, M4(5.010e-03, 5.606e-04, -1.183e-02, 1.325e-03, -1.160e-02, -2.834e-03, 1.911e-02, 8.964e-03, 1.037e-02, 4.293e-03, -2.633e-02, -1.096e-02, -4.118e-03, 1.210e-03, -2.187e-02, -5.899e-03), r1);
	r2 = MulAdd(s0_2_0, M4(4.148e-03, -5.367e-04, -6.102e-03, 2.921e-03, -8.680e-03, -2.034e-03, 2.045e-02, 6.844e-03, 1.089e-02, 5.904e-03, -2.298e-02, -8.141e-03, -3.345e-03, 1.952e-03, -1.883e-02, -5.724e-03), r2);
	r0 = MulAdd(s0_2_1, M4(2.445e-03, 2.057e-02, -6.909e-02, -8.473e-02, -1.316e-02, -1.387e-02, 5.458e-03, 2.278e-02, -2.678e-05, -3.519e-03, -4.066e-02, -4.357e-02, -7.242e-03, -2.300e-02, -2.374e-02, 1.128e-02), r0);
	r1 = MulAdd(s0_2_1, M4(4.510e-03, 2.781e-02, -8.278e-02, -8.541e-02, -1.788e-02, -2.045e-02, 8.962e-03, 2.080e-02, 3.691e-03, 2.532e-03, -2.556e-02, -2.955e-02, -1.097e-02, -2.642e-02, -2.549e-02, 1.467e-02), r1);
	r2 = MulAdd(s0_2_1, M4(3.449e-03, 2.252e-02, -6.958e-02, -7.446e-02, -1.375e-02, -1.536e-02, 1.138e-02, 2.817e-02, 7.026e-05, 8.622e-05, -3.138e-02, -3.542e-02, -9.417e-03, -2.393e-02, -2.625e-02, 1.501e-02), r2);
	r0 = MulAdd(s0_2_2, M4(3.338e-03, 2.398e-03, 7.083e-04, -2.175e-03, -8.321e-04, -4.662e-03, 2.974e-03, 4.460e-03, 4.534e-03, 9.738e-03, -3.293e-03, -3.232e-02, -2.063e-04, 1.183e-03, -9.304e-03, -1.422e-02), r0);
	r1 = MulAdd(s0_2_2, M4(1.187e-03, -5.434e-03, -1.816e-03, -1.413e-02, -1.572e-03, -6.448e-03, 5.545e-04, 3.751e-03, 4.348e-03, 1.156e-02, -7.057e-03, -3.314e-02, -2.309e-03, -2.305e-03, -1.443e-02, -2.179e-02), r1);
	r2 = MulAdd(s0_2_2, M4(3.065e-03, -1.384e-03, 1.043e-03, -7.200e-03, -9.168e-04, -4.974e-03, 1.989e-04, 2.794e-03, 4.744e-03, 8.946e-03, -6.008e-03, -3.112e-02, -4.300e-04, -1.165e-03, -1.206e-02, -2.045e-02), r2);
	// Accumulate the T4 taps.
	r0 = MulAdd(s1_0_0, M4(2.549e-02, 4.367e-03, 5.608e-04, 1.200e-03, -1.287e-02, 5.047e-03, 8.383e-03, -1.765e-04, -2.210e-03, -3.465e-03, -2.028e-03, 1.603e-03, 4.751e-03, -1.158e-03, -3.208e-05, -3.731e-04), r0);
	r1 = MulAdd(s1_0_0, M4(2.997e-02, 3.636e-03, -8.542e-04, -2.135e-03, -1.396e-02, 4.782e-03, 8.170e-03, -3.986e-03, -6.762e-03, -5.967e-03, -7.059e-03, -3.220e-04, 4.776e-03, -6.414e-04, -5.108e-05, 9.779e-04), r1);
	r2 = MulAdd(s1_0_0, M4(2.561e-02, 2.665e-03, -1.456e-04, -2.758e-04, -9.875e-03, 6.161e-03, 7.842e-03, -2.090e-03, -3.183e-03, -3.612e-03, -4.817e-03, 1.875e-03, 4.482e-03, -1.095e-03, -4.289e-04, 4.106e-04), r2);
	r0 = MulAdd(s1_0_1, M4(9.253e-02, 6.472e-02, 4.166e-03, 3.794e-03, -4.317e-02, -7.446e-02, 2.984e-03, 1.125e-02, -6.948e-02, -6.451e-03, 6.715e-03, -1.378e-03, -3.106e-02, 3.054e-02, -9.063e-03, -1.100e-02), r0);
	r1 = MulAdd(s1_0_1, M4(1.008e-01, 7.691e-02, 5.583e-03, 5.517e-03, -6.016e-02, -8.481e-02, 4.323e-03, 1.236e-02, -8.373e-02, -1.979e-02, 3.132e-03, -7.485e-03, -3.163e-02, 3.163e-02, -1.084e-02, -1.428e-02), r1);
	r2 = MulAdd(s1_0_1, M4(8.535e-02, 6.570e-02, 8.216e-04, 4.746e-03, -4.730e-02, -6.811e-02, 5.103e-03, 1.164e-02, -7.006e-02, -1.053e-02, 6.438e-03, -3.761e-03, -2.424e-02, 2.728e-02, -5.191e-03, -1.311e-02), r2);
	r0 = MulAdd(s1_0_2, M4(4.272e-03, 1.171e-02, 1.504e-04, -1.811e-03, 7.502e-04, -7.764e-03, 1.157e-03, 2.931e-03, -3.159e-02, -8.366e-02, -1.040e-02, -1.059e-02, -8.510e-03, 1.777e-02, -1.250e-03, -6.204e-04), r0);
	r1 = MulAdd(s1_0_2, M4(9.189e-03, 9.195e-03, 3.639e-03, -7.032e-04, -3.508e-04, -1.341e-02, 9.476e-04, 6.953e-03, -3.107e-02, -9.106e-02, -1.359e-02, -1.398e-02, -1.008e-02, 2.833e-02, -2.043e-03, 1.502e-06), r1);
	r2 = MulAdd(s1_0_2, M4(4.203e-03, 3.387e-03, 1.814e-03, -3.379e-03, -1.950e-03, -1.608e-02, -9.263e-04, 2.992e-03, -2.396e-02, -7.727e-02, -9.387e-03, -9.091e-03, -6.305e-03, 2.612e-02, -2.590e-03, 8.396e-04), r2);
	r0 = MulAdd(s1_1_0, M4(1.082e-01, 1.166e-02, 3.702e-02, 8.232e-03, -1.140e-01, 3.193e-03, -4.505e-02, 1.128e-02, 5.604e-03, 9.793e-03, 4.491e-03, 1.542e-03, 6.539e-04, -1.588e-03, 9.093e-03, -1.020e-03), r0);
	r1 = MulAdd(s1_1_0, M4(1.288e-01, 1.298e-02, 5.169e-02, 1.013e-02, -1.121e-01, 5.318e-03, -3.926e-02, 1.733e-02, 1.052e-02, 1.333e-02, 9.750e-03, 4.767e-03, -1.950e-03, -2.209e-03, 8.983e-03, -1.338e-03), r1);
	r2 = MulAdd(s1_1_0, M4(1.131e-01, 9.400e-03, 4.090e-02, 6.593e-03, -1.048e-01, 3.937e-03, -3.699e-02, 1.440e-02, 5.949e-03, 1.028e-02, 7.062e-03, 2.203e-03, 2.514e-04, -2.120e-03, 9.723e-03, -1.522e-03), r2);
	r0 = MulAdd(s1_1_1, M4(-5.605e-01, 2.549e-01, 1.421e-01, 1.509e-01, 1.440e-01, -1.397e-01, -1.323e-01, -2.007e-01, 2.144e-01, 6.567e-02, 2.001e-02, 3.966e-02, -2.667e-01, -9.279e-03, -2.485e-01, 3.506e-03), r0);
	r1 = MulAdd(s1_1_1, M4(-6.581e-01, 3.040e-01, 1.626e-01, 1.810e-01, 1.394e-01, -1.806e-01, -1.812e-01, -2.401e-01, 2.417e-01, 8.400e-02, 3.161e-02, 4.871e-02, -2.973e-01, -1.338e-02, -2.783e-01, 8.701e-04), r1);
	r2 = MulAdd(s1_1_1, M4(-5.566e-01, 2.549e-01, 1.313e-01, 1.606e-01, 1.362e-01, -1.480e-01, -1.399e-01, -1.909e-01, 2.013e-01, 6.761e-02, 1.198e-02, 4.056e-02, -2.686e-01, -6.610e-03, -2.493e-01, 5.070e-03), r2);
	r0 = MulAdd(s1_1_2, M4(7.800e-03, -1.617e-02, -3.746e-02, -1.099e-01, -4.032e-05, 1.210e-01, 1.592e-03, 4.728e-02, -1.294e-01, -1.396e-02, -1.062e-01, -1.983e-01, 4.018e-03, 2.510e-01, 1.898e-02, 2.978e-01), r0);
	r1 = MulAdd(s1_1_2, M4(1.311e-02, -7.745e-03, -4.456e-02, -1.386e-01, 8.158e-03, 1.429e-01, 5.130e-03, 4.686e-02, -1.401e-01, 3.352e-03, -1.033e-01, -2.017e-01, 1.604e-02, 2.842e-01, 3.357e-02, 3.506e-01), r1);
	r2 = MulAdd(s1_1_2, M4(1.285e-02, -9.625e-03, -3.863e-02, -1.201e-01, 6.254e-03, 1.320e-01, 3.551e-03, 3.693e-02, -1.221e-01, -1.206e-02, -9.002e-02, -1.860e-01, 7.006e-03, 2.388e-01, 2.756e-02, 3.005e-01), r2);
	r0 = MulAdd(s1_2_0, M4(-8.718e-03, -1.037e-03, -1.329e-02, 1.026e-02, 1.678e-02, 4.715e-03, -4.187e-02, 5.378e-03, 3.580e-04, 6.569e-04, -4.271e-04, 4.563e-03, -2.292e-03, -2.857e-03, -5.834e-03, -2.351e-03), r0);
	r1 = MulAdd(s1_2_0, M4(-1.338e-02, -3.866e-03, -1.535e-02, 8.542e-03, 2.863e-02, 1.309e-02, -3.109e-02, 8.750e-03, -2.170e-03, -8.912e-04, -2.507e-03, 3.084e-03, -2.319e-03, -2.982e-03, -7.898e-03, -2.932e-03), r1);
	r2 = MulAdd(s1_2_0, M4(-1.270e-02, -2.163e-03, -1.487e-02, 9.038e-03, 1.595e-02, 7.522e-03, -3.844e-02, 6.488e-03, -2.174e-03, 1.938e-04, -2.853e-03, 3.200e-03, -1.832e-03, -2.598e-03, -6.107e-03, -2.845e-03), r2);
	r0 = MulAdd(s1_2_1, M4(1.824e-02, -1.558e-02, -1.185e-01, -5.481e-02, 2.240e-02, 4.109e-02, 1.802e-01, 6.715e-02, 1.371e-02, -2.255e-03, 1.213e-01, 2.423e-02, 6.824e-03, -9.594e-03, -6.067e-02, 1.666e-02), r0);
	r1 = MulAdd(s1_2_1, M4(2.769e-02, -2.107e-02, -1.284e-01, -6.497e-02, 2.953e-02, 5.188e-02, 2.017e-01, 9.009e-02, 8.118e-03, -1.213e-02, 1.233e-01, 2.002e-02, 8.622e-03, -7.615e-03, -5.872e-02, 2.021e-02), r1);
	r2 = MulAdd(s1_2_1, M4(2.472e-02, -1.890e-02, -1.135e-01, -5.086e-02, 2.424e-02, 3.811e-02, 1.755e-01, 5.875e-02, 1.317e-02, -9.620e-03, 1.145e-01, 1.699e-02, 3.954e-03, -9.550e-03, -5.993e-02, 1.558e-02), r2);
	r0 = MulAdd(s1_2_2, M4(-5.981e-04, 4.059e-03, -8.714e-03, -3.075e-03, 5.228e-04, 1.862e-03, 1.226e-02, 7.729e-02, -4.944e-04, 6.299e-03, -1.920e-03, 8.002e-02, 5.093e-03, 1.857e-02, -1.437e-02, 1.116e-02), r0);
	r1 = MulAdd(s1_2_2, M4(-3.603e-03, 4.258e-03, -9.373e-03, -7.076e-03, 2.955e-03, 6.970e-03, 2.041e-02, 9.448e-02, 5.166e-03, 1.082e-02, 1.821e-03, 9.009e-02, -1.906e-03, 1.419e-02, -2.485e-02, -2.403e-04), r1);
	r2 = MulAdd(s1_2_2, M4(-2.575e-03, 4.925e-03, -6.162e-03, 2.199e-03, 4.524e-04, 3.943e-03, 1.752e-02, 8.613e-02, 7.730e-04, 1.097e-02, -4.925e-03, 7.983e-02, 2.718e-03, 1.685e-02, -1.953e-02, 1.140e-03), r2);
	// The s0_* variables are reused here: they now hold the 3x3 taps from T5 (L2).
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	// Accumulate the T5 taps.
	r0 = MulAdd(s0_0_0, M4(9.599e-03, -5.016e-03, -3.751e-03, 1.423e-03, 5.928e-03, -9.551e-03, 6.516e-03, -3.685e-03, 6.215e-02, 1.666e-02, 7.244e-04, 1.196e-03, -1.016e-02, -6.131e-04, -8.095e-03, 1.289e-03), r0);
	r1 = MulAdd(s0_0_0, M4(1.171e-02, -5.513e-03, -6.610e-04, 5.199e-03, 2.755e-03, -9.847e-03, 3.484e-03, -1.294e-03, 6.665e-02, 1.713e-02, -7.451e-04, 1.562e-03, -4.163e-03, 4.523e-05, -3.950e-03, 2.652e-03), r1);
	r2 = MulAdd(s0_0_0, M4(7.309e-03, -7.294e-03, -1.819e-03, 2.978e-03, 3.303e-03, -7.906e-03, 4.938e-03, -2.163e-03, 6.717e-02, 1.636e-02, 1.155e-05, 6.079e-04, -9.176e-03, -1.222e-03, -6.094e-03, 1.064e-03), r2);
	r0 = MulAdd(s0_0_1, M4(7.154e-02, 6.619e-02, -7.220e-03, -9.588e-03, -7.292e-02, 2.706e-02, -1.423e-03, 1.664e-02, 2.802e-01, 1.987e-01, -4.608e-03, 3.766e-03, 1.110e-02, 1.368e-02, 8.975e-03, -1.317e-02), r0);
	r1 = MulAdd(s0_0_1, M4(9.001e-02, 8.132e-02, -8.479e-03, -7.574e-03, -7.056e-02, 1.567e-02, -2.455e-03, 9.497e-03, 3.154e-01, 2.305e-01, 1.472e-03, 9.256e-03, 7.881e-03, 2.082e-02, 7.745e-03, -1.118e-02), r1);
	r2 = MulAdd(s0_0_1, M4(7.677e-02, 6.519e-02, -1.016e-02, -9.067e-03, -6.601e-02, 2.142e-02, -1.993e-04, 1.292e-02, 2.686e-01, 1.968e-01, 7.674e-04, 5.211e-03, 7.613e-03, 1.816e-02, 1.001e-02, -1.292e-02), r2);
	r0 = MulAdd(s0_0_2, M4(-5.667e-03, 2.517e-02, 4.016e-03, 1.166e-03, -1.490e-03, -4.222e-02, 5.273e-03, -1.030e-02, 9.170e-03, 1.294e-01, 9.419e-03, 2.431e-03, -1.625e-03, 2.290e-02, -9.978e-04, 3.066e-03), r0);
	r1 = MulAdd(s0_0_2, M4(-8.005e-03, 2.844e-02, 5.040e-03, -2.384e-03, -4.880e-03, -4.365e-02, 4.673e-03, -1.168e-02, 9.111e-03, 1.362e-01, 6.178e-03, -1.300e-03, -7.544e-04, 2.356e-02, -4.943e-04, 1.941e-03), r1);
	r2 = MulAdd(s0_0_2, M4(-6.818e-03, 2.973e-02, 5.349e-03, -7.071e-04, -2.906e-03, -3.576e-02, 4.842e-03, -8.156e-03, 1.037e-02, 1.267e-01, 6.106e-03, 2.860e-03, -1.352e-04, 2.208e-02, -2.990e-05, 3.815e-03), r2);
	r0 = MulAdd(s0_1_0, M4(2.936e-02, -1.010e-02, 2.243e-02, -5.835e-03, 2.910e-02, -8.258e-03, 7.983e-02, -1.305e-02, -2.666e-02, -8.679e-03, -2.713e-02, -5.938e-03, 9.108e-02, 2.618e-02, 3.208e-02, -1.978e-02), r0);
	r1 = MulAdd(s0_1_0, M4(3.179e-02, -9.036e-03, 1.334e-02, -1.131e-02, 3.446e-02, -8.594e-03, 8.814e-02, -1.492e-02, -3.660e-02, -1.740e-02, -2.798e-02, -9.896e-03, 1.040e-01, 4.165e-02, 3.445e-02, -1.639e-02), r1);
	r2 = MulAdd(s0_1_0, M4(2.957e-02, -6.531e-03, 1.104e-02, -9.064e-03, 3.035e-02, -6.823e-03, 8.017e-02, -1.229e-02, -2.784e-02, -1.198e-02, -2.232e-02, -6.417e-03, 7.983e-02, 3.034e-02, 2.850e-02, -1.837e-02), r2);
	r0 = MulAdd(s0_1_1, M4(-4.597e-01, -1.145e-01, 1.234e-01, 1.240e-01, 3.720e-02, 1.274e-01, -3.776e-01, 2.271e-01, 2.878e-02, -2.172e-02, -3.037e-01, -1.997e-01, 3.805e-01, -6.354e-01, -6.330e-03, 3.320e-02), r0);
	r1 = MulAdd(s0_1_1, M4(-5.157e-01, -1.255e-01, 1.587e-01, 1.362e-01, 4.696e-02, 1.458e-01, -4.268e-01, 2.560e-01, 4.709e-02, -1.647e-02, -3.330e-01, -2.114e-01, 4.463e-01, -7.480e-01, -2.327e-02, 4.350e-02), r1);
	r2 = MulAdd(s0_1_1, M4(-4.601e-01, -1.139e-01, 1.392e-01, 1.135e-01, 3.080e-02, 1.240e-01, -3.649e-01, 2.110e-01, 3.524e-02, -1.575e-02, -3.076e-01, -1.919e-01, 3.857e-01, -6.334e-01, -7.127e-03, 2.968e-02), r2);
	r0 = MulAdd(s0_1_2, M4(3.182e-02, -2.339e-01, -1.425e-03, 2.675e-02, 5.640e-03, -5.581e-02, 2.570e-02, -2.437e-02, -1.636e-02, 1.739e-02, -1.369e-02, -1.392e-01, -1.210e-02, 4.389e-02, 1.189e-02, 2.288e-02), r0);
	r1 = MulAdd(s0_1_2, M4(3.552e-02, -2.646e-01, -8.056e-03, 3.943e-02, 1.996e-03, -7.202e-02, 2.801e-02, -1.786e-02, -2.729e-02, 1.886e-02, -1.811e-02, -1.569e-01, -1.432e-02, 4.872e-02, 1.727e-02, 2.648e-02), r1);
	r2 = MulAdd(s0_1_2, M4(3.159e-02, -2.407e-01, -8.963e-03, 3.632e-02, 3.187e-03, -6.018e-02, 2.437e-02, -1.726e-02, -2.058e-02, 1.570e-02, -1.307e-02, -1.450e-01, -1.516e-02, 4.676e-02, 1.411e-02, 2.288e-02), r2);
	r0 = MulAdd(s0_2_0, M4(-2.774e-03, -2.248e-03, -1.658e-03, -1.922e-02, -1.090e-03, -2.803e-03, 1.483e-02, -9.581e-03, -3.050e-04, 9.037e-05, -1.217e-02, -2.601e-03, -4.771e-03, 6.907e-03, 1.896e-02, -2.166e-02), r0);
	r1 = MulAdd(s0_2_0, M4(-3.349e-03, -9.704e-04, 1.633e-03, -1.654e-02, -1.312e-03, -2.742e-03, 1.736e-02, -1.029e-02, 5.028e-04, 1.465e-03, -1.308e-02, -2.366e-03, -8.290e-03, 5.539e-03, 1.058e-02, -2.458e-02), r1);
	r2 = MulAdd(s0_2_0, M4(-3.343e-03, -2.181e-03, 5.008e-03, -1.508e-02, 9.087e-04, -2.463e-03, 1.643e-02, -8.457e-03, 1.550e-04, 1.314e-03, -1.193e-02, -2.358e-03, -3.617e-03, 5.829e-03, 1.364e-02, -2.144e-02), r2);
	r0 = MulAdd(s0_2_1, M4(-2.578e-02, -1.567e-02, 1.753e-01, 1.255e-01, -1.480e-02, 1.183e-03, 2.809e-02, 3.593e-02, -9.426e-04, 6.873e-04, 2.414e-02, 3.285e-03, -3.756e-03, -3.727e-03, 2.692e-02, -4.921e-02), r0);
	r1 = MulAdd(s0_2_1, M4(-3.430e-02, -2.293e-02, 1.841e-01, 1.295e-01, -1.652e-02, 1.931e-03, 3.519e-02, 4.187e-02, -2.484e-03, -6.450e-04, 2.645e-02, 3.062e-03, -8.907e-03, -2.771e-03, 2.206e-02, -5.225e-02), r1);
	r2 = MulAdd(s0_2_1, M4(-2.411e-02, -1.503e-02, 1.623e-01, 1.246e-01, -1.543e-02, 3.884e-03, 2.570e-02, 3.866e-02, -2.513e-03, -6.077e-04, 2.472e-02, 3.677e-03, -6.050e-03, 2.919e-04, 3.325e-02, -4.911e-02), r2);
	r0 = MulAdd(s0_2_2, M4(-8.225e-04, -1.346e-02, 2.102e-02, 9.694e-02, 3.421e-03, -5.557e-03, 9.124e-03, -9.606e-03, 2.420e-04, -1.778e-03, -1.186e-02, -1.637e-04, -4.730e-03, 2.229e-04, 3.822e-04, -9.578e-03), r0);
	r1 = MulAdd(s0_2_2, M4(-1.892e-03, -1.820e-02, 2.101e-02, 9.924e-02, 5.988e-03, -5.526e-03, 9.033e-03, -9.727e-03, 5.104e-04, -2.141e-03, -1.390e-02, -6.450e-04, -4.234e-03, 3.700e-04, 1.872e-03, -6.992e-03), r1);
	r2 = MulAdd(s0_2_2, M4(-3.970e-04, -1.282e-02, 2.228e-02, 8.927e-02, 5.372e-03, -5.608e-03, 8.705e-03, -1.259e-02, 5.027e-04, -2.396e-03, -1.222e-02, -3.329e-04, -5.052e-03, 5.618e-04, -1.378e-03, -8.540e-03), r2);
	// fpos: centre of the quad's origin pixel, scaled by opt (presumably the reciprocal
	// output size -- confirm against GetOutputPt). Used to sample INPUT with the
	// linear sampler SL, one output pixel at a time.
	float2 opt = float2(GetOutputPt()), fpos = (float2(gxy) + 0.5) * opt;
	// For each quad pixel: linearly sampled INPUT rgb + (r0, r1, r2) lane for that
	// pixel, clamped by saturate; alpha is written as 1.0.
	OUTPUT[gxy + int2(0, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb + MF3(r0.x, r1.x, r2.x)), 1.0);
	OUTPUT[gxy + int2(1, 0)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb + MF3(r0.y, r1.y, r2.y)), 1.0);
	OUTPUT[gxy + int2(0, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb + MF3(r0.z, r1.z, r2.z)), 1.0);
	OUTPUT[gxy + int2(1, 1)] = MF4(saturate(INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb + MF3(r0.w, r1.w, r2.w)), 1.0);
}
